[llvm] 692d46d - [NFC][X86][Codegen] Fix test coverage for AVX512 i1 mask replication

Roman Lebedev via llvm-commits llvm-commits at lists.llvm.org
Tue Dec 13 14:52:25 PST 2022


Author: Roman Lebedev
Date: 2022-12-14T01:50:53+03:00
New Revision: 692d46d839a7253669349a18555f3855b3476844

URL: https://github.com/llvm/llvm-project/commit/692d46d839a7253669349a18555f3855b3476844
DIFF: https://github.com/llvm/llvm-project/commit/692d46d839a7253669349a18555f3855b3476844.diff

LOG: [NFC][X86][Codegen] Fix test coverage for AVX512 i1 mask replication

At least some of the seemingly-bad codegen arose because the replicated
i1 mask really needs to be consumed by a mask-consuming instruction
if we want faithful codegen for the replication shuffle.

But some of the odd codegen is still there.

Added: 
    

Modified: 
    llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll

Removed: 
    


################################################################################
diff  --git a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll
index 1fac1a049095..4ad271dc2370 100644
--- a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll
+++ b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll
@@ -8,44 +8,48 @@
 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512vbmi | FileCheck %s --check-prefixes=AVX512,AVX512BW,AVX512VBMI-ONLY,AVX512VBMI-SLOW,FALLBACK6
 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512vbmi,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW,AVX512VBMI-ONLY,AVX512VBMI-FAST,FALLBACK7
 
-define void @mask_replication_factor2_vf2(ptr %in.vec, ptr %out.vec) nounwind {
+define void @mask_replication_factor2_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512F-ONLY-LABEL: mask_replication_factor2_vf2:
 ; AVX512F-ONLY:       # %bb.0:
 ; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
 ; AVX512F-ONLY-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512F-ONLY-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; AVX512F-ONLY-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; AVX512F-ONLY-NEXT:    vptestmd %xmm0, %xmm0, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, %eax
-; AVX512F-ONLY-NEXT:    movb %al, (%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %xmm0, %xmm0, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %xmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa %xmm0, (%rdx)
 ; AVX512F-ONLY-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: mask_replication_factor2_vf2:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    kmovb (%rdi), %k0
+; AVX512DQ-NEXT:    kmovw (%rdi), %k0
 ; AVX512DQ-NEXT:    vpmovm2d %k0, %xmm0
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; AVX512DQ-NEXT:    vpmovd2m %xmm0, %k0
-; AVX512DQ-NEXT:    kmovb %k0, (%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %xmm0, %k1
+; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %xmm0 {%k1} {z}
+; AVX512DQ-NEXT:    vmovdqa %xmm0, (%rdx)
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512BW-LABEL: mask_replication_factor2_vf2:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    kmovw (%rdi), %k1
+; AVX512BW-NEXT:    kmovq (%rdi), %k1
 ; AVX512BW-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; AVX512BW-NEXT:    vptestmd %xmm0, %xmm0, %k0
-; AVX512BW-NEXT:    kmovd %k0, %eax
-; AVX512BW-NEXT:    movb %al, (%rsi)
+; AVX512BW-NEXT:    vptestmd %xmm0, %xmm0, %k1
+; AVX512BW-NEXT:    vmovdqa32 (%rsi), %xmm0 {%k1} {z}
+; AVX512BW-NEXT:    vmovdqa %xmm0, (%rdx)
 ; AVX512BW-NEXT:    retq
-  %src.vec = load <2 x i1>, ptr %in.vec, align 64
-  %tgt.mask = shufflevector <2 x i1> %src.vec, <2 x i1> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
-  store <4 x i1> %tgt.mask, ptr %out.vec, align 64
+  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
+  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <2 x i32> <i32 0, i32 1>
+  %tgt.mask = shufflevector <2 x i1> %src.mask, <2 x i1> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+  %data = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %in.vec, i32 64, <4 x i1> %tgt.mask, <4 x i32> poison)
+  %data.padded = shufflevector <4 x i32> %data, <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  store <4 x i32> %data, ptr %out.vec, align 64
   ret void
 }
 
-define void @mask_replication_factor2_vf4(ptr %in.vec, ptr %out.vec) nounwind {
+define void @mask_replication_factor2_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512F-SLOW-LABEL: mask_replication_factor2_vf4:
 ; AVX512F-SLOW:       # %bb.0:
 ; AVX512F-SLOW-NEXT:    kmovw (%rdi), %k1
@@ -53,9 +57,9 @@ define void @mask_replication_factor2_vf4(ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512F-SLOW-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; AVX512F-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX512F-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
-; AVX512F-SLOW-NEXT:    vptestmd %ymm0, %ymm0, %k0
-; AVX512F-SLOW-NEXT:    kmovw %k0, %eax
-; AVX512F-SLOW-NEXT:    movb %al, (%rsi)
+; AVX512F-SLOW-NEXT:    vptestmd %ymm0, %ymm0, %k1
+; AVX512F-SLOW-NEXT:    vmovdqa32 (%rsi), %ymm0 {%k1} {z}
+; AVX512F-SLOW-NEXT:    vmovdqa %ymm0, (%rdx)
 ; AVX512F-SLOW-NEXT:    vzeroupper
 ; AVX512F-SLOW-NEXT:    retq
 ;
@@ -66,100 +70,106 @@ define void @mask_replication_factor2_vf4(ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512F-FAST-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3]
 ; AVX512F-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
-; AVX512F-FAST-NEXT:    vptestmd %ymm0, %ymm0, %k0
-; AVX512F-FAST-NEXT:    kmovw %k0, %eax
-; AVX512F-FAST-NEXT:    movb %al, (%rsi)
+; AVX512F-FAST-NEXT:    vptestmd %ymm0, %ymm0, %k1
+; AVX512F-FAST-NEXT:    vmovdqa32 (%rsi), %ymm0 {%k1} {z}
+; AVX512F-FAST-NEXT:    vmovdqa %ymm0, (%rdx)
 ; AVX512F-FAST-NEXT:    vzeroupper
 ; AVX512F-FAST-NEXT:    retq
 ;
 ; AVX512DQ-SLOW-LABEL: mask_replication_factor2_vf4:
 ; AVX512DQ-SLOW:       # %bb.0:
-; AVX512DQ-SLOW-NEXT:    kmovb (%rdi), %k0
+; AVX512DQ-SLOW-NEXT:    kmovw (%rdi), %k0
 ; AVX512DQ-SLOW-NEXT:    vpmovm2d %k0, %ymm0
 ; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX512DQ-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
-; AVX512DQ-SLOW-NEXT:    vpmovd2m %ymm0, %k0
-; AVX512DQ-SLOW-NEXT:    kmovb %k0, (%rsi)
+; AVX512DQ-SLOW-NEXT:    vpmovd2m %ymm0, %k1
+; AVX512DQ-SLOW-NEXT:    vmovdqa32 (%rsi), %ymm0 {%k1} {z}
+; AVX512DQ-SLOW-NEXT:    vmovdqa %ymm0, (%rdx)
 ; AVX512DQ-SLOW-NEXT:    vzeroupper
 ; AVX512DQ-SLOW-NEXT:    retq
 ;
 ; AVX512DQ-FAST-LABEL: mask_replication_factor2_vf4:
 ; AVX512DQ-FAST:       # %bb.0:
-; AVX512DQ-FAST-NEXT:    kmovb (%rdi), %k0
+; AVX512DQ-FAST-NEXT:    kmovw (%rdi), %k0
 ; AVX512DQ-FAST-NEXT:    vpmovm2d %k0, %ymm0
 ; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3]
 ; AVX512DQ-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
-; AVX512DQ-FAST-NEXT:    vpmovd2m %ymm0, %k0
-; AVX512DQ-FAST-NEXT:    kmovb %k0, (%rsi)
+; AVX512DQ-FAST-NEXT:    vpmovd2m %ymm0, %k1
+; AVX512DQ-FAST-NEXT:    vmovdqa32 (%rsi), %ymm0 {%k1} {z}
+; AVX512DQ-FAST-NEXT:    vmovdqa %ymm0, (%rdx)
 ; AVX512DQ-FAST-NEXT:    vzeroupper
 ; AVX512DQ-FAST-NEXT:    retq
 ;
 ; AVX512BW-SLOW-LABEL: mask_replication_factor2_vf4:
 ; AVX512BW-SLOW:       # %bb.0:
-; AVX512BW-SLOW-NEXT:    kmovw (%rdi), %k1
+; AVX512BW-SLOW-NEXT:    kmovq (%rdi), %k1
 ; AVX512BW-SLOW-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
 ; AVX512BW-SLOW-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX512BW-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
-; AVX512BW-SLOW-NEXT:    vptestmd %ymm0, %ymm0, %k0
-; AVX512BW-SLOW-NEXT:    kmovd %k0, %eax
-; AVX512BW-SLOW-NEXT:    movb %al, (%rsi)
+; AVX512BW-SLOW-NEXT:    vptestmd %ymm0, %ymm0, %k1
+; AVX512BW-SLOW-NEXT:    vmovdqa32 (%rsi), %ymm0 {%k1} {z}
+; AVX512BW-SLOW-NEXT:    vmovdqa %ymm0, (%rdx)
 ; AVX512BW-SLOW-NEXT:    vzeroupper
 ; AVX512BW-SLOW-NEXT:    retq
 ;
 ; AVX512BW-FAST-LABEL: mask_replication_factor2_vf4:
 ; AVX512BW-FAST:       # %bb.0:
-; AVX512BW-FAST-NEXT:    kmovw (%rdi), %k1
+; AVX512BW-FAST-NEXT:    kmovq (%rdi), %k1
 ; AVX512BW-FAST-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
 ; AVX512BW-FAST-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3]
 ; AVX512BW-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
-; AVX512BW-FAST-NEXT:    vptestmd %ymm0, %ymm0, %k0
-; AVX512BW-FAST-NEXT:    kmovd %k0, %eax
-; AVX512BW-FAST-NEXT:    movb %al, (%rsi)
+; AVX512BW-FAST-NEXT:    vptestmd %ymm0, %ymm0, %k1
+; AVX512BW-FAST-NEXT:    vmovdqa32 (%rsi), %ymm0 {%k1} {z}
+; AVX512BW-FAST-NEXT:    vmovdqa %ymm0, (%rdx)
 ; AVX512BW-FAST-NEXT:    vzeroupper
 ; AVX512BW-FAST-NEXT:    retq
 ;
 ; AVX512VBMI-SLOW-LABEL: mask_replication_factor2_vf4:
 ; AVX512VBMI-SLOW:       # %bb.0:
-; AVX512VBMI-SLOW-NEXT:    kmovw (%rdi), %k1
+; AVX512VBMI-SLOW-NEXT:    kmovq (%rdi), %k1
 ; AVX512VBMI-SLOW-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
 ; AVX512VBMI-SLOW-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; AVX512VBMI-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX512VBMI-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
-; AVX512VBMI-SLOW-NEXT:    vptestmd %ymm0, %ymm0, %k0
-; AVX512VBMI-SLOW-NEXT:    kmovd %k0, %eax
-; AVX512VBMI-SLOW-NEXT:    movb %al, (%rsi)
+; AVX512VBMI-SLOW-NEXT:    vptestmd %ymm0, %ymm0, %k1
+; AVX512VBMI-SLOW-NEXT:    vmovdqa32 (%rsi), %ymm0 {%k1} {z}
+; AVX512VBMI-SLOW-NEXT:    vmovdqa %ymm0, (%rdx)
 ; AVX512VBMI-SLOW-NEXT:    vzeroupper
 ; AVX512VBMI-SLOW-NEXT:    retq
 ;
 ; AVX512VBMI-FAST-LABEL: mask_replication_factor2_vf4:
 ; AVX512VBMI-FAST:       # %bb.0:
-; AVX512VBMI-FAST-NEXT:    kmovw (%rdi), %k1
+; AVX512VBMI-FAST-NEXT:    kmovq (%rdi), %k1
 ; AVX512VBMI-FAST-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
 ; AVX512VBMI-FAST-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; AVX512VBMI-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3]
 ; AVX512VBMI-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
-; AVX512VBMI-FAST-NEXT:    vptestmd %ymm0, %ymm0, %k0
-; AVX512VBMI-FAST-NEXT:    kmovd %k0, %eax
-; AVX512VBMI-FAST-NEXT:    movb %al, (%rsi)
+; AVX512VBMI-FAST-NEXT:    vptestmd %ymm0, %ymm0, %k1
+; AVX512VBMI-FAST-NEXT:    vmovdqa32 (%rsi), %ymm0 {%k1} {z}
+; AVX512VBMI-FAST-NEXT:    vmovdqa %ymm0, (%rdx)
 ; AVX512VBMI-FAST-NEXT:    vzeroupper
 ; AVX512VBMI-FAST-NEXT:    retq
-  %src.vec = load <4 x i1>, ptr %in.vec, align 64
-  %tgt.mask = shufflevector <4 x i1> %src.vec, <4 x i1> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
-  store <8 x i1> %tgt.mask, ptr %out.vec, align 64
+  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
+  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %tgt.mask = shufflevector <4 x i1> %src.mask, <4 x i1> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+  %data = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr %in.vec, i32 64, <8 x i1> %tgt.mask, <8 x i32> poison)
+  %data.padded = shufflevector <8 x i32> %data, <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  store <8 x i32> %data, ptr %out.vec, align 64
   ret void
 }
 
-define void @mask_replication_factor2_vf8(ptr %in.vec, ptr %out.vec) nounwind {
+define void @mask_replication_factor2_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512F-ONLY-LABEL: mask_replication_factor2_vf8:
 ; AVX512F-ONLY:       # %bb.0:
 ; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
 ; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm0
-; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, (%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, (%rdx)
 ; AVX512F-ONLY-NEXT:    vzeroupper
 ; AVX512F-ONLY-NEXT:    retq
 ;
@@ -169,8 +179,9 @@ define void @mask_replication_factor2_vf8(ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
 ; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
-; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k0
-; AVX512DQ-NEXT:    kmovw %k0, (%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k1
+; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
+; AVX512DQ-NEXT:    vmovdqa64 %zmm0, (%rdx)
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
@@ -180,29 +191,34 @@ define void @mask_replication_factor2_vf8(ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512BW-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; AVX512BW-NEXT:    vpermd %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; AVX512BW-NEXT:    kmovw %k0, (%rsi)
+; AVX512BW-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdx)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
-  %src.vec = load <8 x i1>, ptr %in.vec, align 64
-  %tgt.mask = shufflevector <8 x i1> %src.vec, <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
-  store <16 x i1> %tgt.mask, ptr %out.vec, align 64
+  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
+  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %tgt.mask = shufflevector <8 x i1> %src.mask, <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+  %data = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr %in.vec, i32 64, <16 x i1> %tgt.mask, <16 x i32> poison)
+  store <16 x i32> %data, ptr %out.vec, align 64
   ret void
 }
 
-define void @mask_replication_factor2_vf16(ptr %in.vec, ptr %out.vec) nounwind {
+define void @mask_replication_factor2_vf16(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512F-ONLY-LABEL: mask_replication_factor2_vf16:
 ; AVX512F-ONLY:       # %bb.0:
 ; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
 ; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
-; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k0
 ; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm0
-; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; AVX512F-ONLY-NEXT:    kmovw %k1, 2(%rsi)
-; AVX512F-ONLY-NEXT:    kmovw %k0, (%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k2
+; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k2} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm1, 64(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, (%rdx)
 ; AVX512F-ONLY-NEXT:    vzeroupper
 ; AVX512F-ONLY-NEXT:    retq
 ;
@@ -210,14 +226,16 @@ define void @mask_replication_factor2_vf16(ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    kmovw (%rdi), %k0
 ; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
-; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k0
 ; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k1
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
-; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k1
-; AVX512DQ-NEXT:    kmovw %k1, 2(%rsi)
-; AVX512DQ-NEXT:    kmovw %k0, (%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k2
+; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k2} {z}
+; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
+; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 64(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm0, (%rdx)
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
@@ -227,37 +245,47 @@ define void @mask_replication_factor2_vf16(ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512BW-NEXT:    vpmovm2w %k0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; AVX512BW-NEXT:    vpermw %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT:    vpmovw2m %zmm0, %k0
-; AVX512BW-NEXT:    kmovd %k0, (%rsi)
+; AVX512BW-NEXT:    vpmovw2m %zmm0, %k1
+; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    kshiftrd $16, %k1, %k1
+; AVX512BW-NEXT:    vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, 64(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdx)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
-  %src.vec = load <16 x i1>, ptr %in.vec, align 64
-  %tgt.mask = shufflevector <16 x i1> %src.vec, <16 x i1> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
-  store <32 x i1> %tgt.mask, ptr %out.vec, align 64
+  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
+  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %tgt.mask = shufflevector <16 x i1> %src.mask, <16 x i1> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
+  %data = call <32 x i32> @llvm.masked.load.v32i32.p0(ptr %in.vec, i32 64, <32 x i1> %tgt.mask, <32 x i32> poison)
+  store <32 x i32> %data, ptr %out.vec, align 64
   ret void
 }
 
-define void @mask_replication_factor2_vf32(ptr %in.vec, ptr %out.vec) nounwind {
+define void @mask_replication_factor2_vf32(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512F-ONLY-LABEL: mask_replication_factor2_vf32:
 ; AVX512F-ONLY:       # %bb.0:
 ; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
 ; AVX512F-ONLY-NEXT:    kmovw 2(%rdi), %k2
-; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm2
-; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k0
-; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k2
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm2, %zmm0
-; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k3
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
-; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k2
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm2, %zmm0
-; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k3
-; AVX512F-ONLY-NEXT:    kmovw %k3, 4(%rsi)
-; AVX512F-ONLY-NEXT:    kmovw %k2, 6(%rsi)
-; AVX512F-ONLY-NEXT:    kmovw %k1, (%rsi)
-; AVX512F-ONLY-NEXT:    kmovw %k0, 2(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k4
+; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k4} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k3} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k2} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm1, (%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
 ; AVX512F-ONLY-NEXT:    vzeroupper
 ; AVX512F-ONLY-NEXT:    retq
 ;
@@ -265,128 +293,174 @@ define void @mask_replication_factor2_vf32(ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    kmovw (%rdi), %k0
 ; AVX512DQ-NEXT:    kmovw 2(%rdi), %k1
-; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512DQ-NEXT:    vpmovm2d %k1, %zmm0
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm2
-; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k0
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k1
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm2, %zmm0
 ; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k2
-; AVX512DQ-NEXT:    vpmovm2d %k1, %zmm0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
-; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k1
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k3
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm2, %zmm0
-; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k3
-; AVX512DQ-NEXT:    kmovw %k3, 4(%rsi)
-; AVX512DQ-NEXT:    kmovw %k1, 6(%rsi)
-; AVX512DQ-NEXT:    kmovw %k2, (%rsi)
-; AVX512DQ-NEXT:    kmovw %k0, 2(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k4
+; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k4} {z}
+; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k3} {z}
+; AVX512DQ-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
+; AVX512DQ-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
+; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm1, (%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512BW-ONLY-LABEL: mask_replication_factor2_vf32:
 ; AVX512BW-ONLY:       # %bb.0:
-; AVX512BW-ONLY-NEXT:    kmovd (%rdi), %k0
+; AVX512BW-ONLY-NEXT:    kmovq (%rdi), %k0
 ; AVX512BW-ONLY-NEXT:    vpmovm2b %k0, %zmm0
 ; AVX512BW-ONLY-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,2,3,2,3]
 ; AVX512BW-ONLY-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
-; AVX512BW-ONLY-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512BW-ONLY-NEXT:    kmovq %k0, (%rsi)
+; AVX512BW-ONLY-NEXT:    vpmovb2m %zmm0, %k1
+; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
+; AVX512BW-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k1} {z}
+; AVX512BW-ONLY-NEXT:    kshiftrq $32, %k1, %k1
+; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
+; AVX512BW-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm1, (%rdx)
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
 ; AVX512BW-ONLY-NEXT:    vzeroupper
 ; AVX512BW-ONLY-NEXT:    retq
 ;
 ; AVX512VBMI-ONLY-LABEL: mask_replication_factor2_vf32:
 ; AVX512VBMI-ONLY:       # %bb.0:
-; AVX512VBMI-ONLY-NEXT:    kmovd (%rdi), %k0
+; AVX512VBMI-ONLY-NEXT:    kmovq (%rdi), %k0
 ; AVX512VBMI-ONLY-NEXT:    vpmovm2b %k0, %zmm0
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512VBMI-ONLY-NEXT:    vpermb %zmm0, %zmm1, %zmm0
-; AVX512VBMI-ONLY-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512VBMI-ONLY-NEXT:    kmovq %k0, (%rsi)
+; AVX512VBMI-ONLY-NEXT:    vpmovb2m %zmm0, %k1
+; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k1} {z}
+; AVX512VBMI-ONLY-NEXT:    kshiftrq $32, %k1, %k1
+; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm1, (%rdx)
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
 ; AVX512VBMI-ONLY-NEXT:    vzeroupper
 ; AVX512VBMI-ONLY-NEXT:    retq
-  %src.vec = load <32 x i1>, ptr %in.vec, align 64
-  %tgt.mask = shufflevector <32 x i1> %src.vec, <32 x i1> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
-  store <64 x i1> %tgt.mask, ptr %out.vec, align 64
+  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
+  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %tgt.mask = shufflevector <32 x i1> %src.mask, <32 x i1> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
+  %data = call <64 x i32> @llvm.masked.load.v64i32.p0(ptr %in.vec, i32 64, <64 x i1> %tgt.mask, <64 x i32> poison)
+  store <64 x i32> %data, ptr %out.vec, align 64
   ret void
 }
 
-define void @mask_replication_factor2_vf64(ptr %in.vec, ptr %out.vec) nounwind {
+define void @mask_replication_factor2_vf64(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512F-ONLY-LABEL: mask_replication_factor2_vf64:
 ; AVX512F-ONLY:       # %bb.0:
-; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
-; AVX512F-ONLY-NEXT:    kmovw 2(%rdi), %k3
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k3
+; AVX512F-ONLY-NEXT:    kmovw 2(%rdi), %k5
 ; AVX512F-ONLY-NEXT:    kmovw 4(%rdi), %k4
-; AVX512F-ONLY-NEXT:    kmovw 6(%rdi), %k2
+; AVX512F-ONLY-NEXT:    kmovw 6(%rdi), %k1
 ; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm2
-; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k0
-; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm2, %zmm0
-; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k3} {z}
-; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm3
-; AVX512F-ONLY-NEXT:    vptestmd %zmm3, %zmm3, %k3
+; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k1
+; AVX512F-ONLY-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm2, %zmm0
-; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k5
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k2
 ; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z}
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm3
 ; AVX512F-ONLY-NEXT:    vptestmd %zmm3, %zmm3, %k4
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm2, %zmm0
 ; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k6
-; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
-; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
-; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k2
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k5} {z}
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm3
+; AVX512F-ONLY-NEXT:    vptestmd %zmm3, %zmm3, %k5
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm2, %zmm0
 ; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k7
-; AVX512F-ONLY-NEXT:    kmovw %k7, 12(%rsi)
-; AVX512F-ONLY-NEXT:    kmovw %k2, 14(%rsi)
-; AVX512F-ONLY-NEXT:    kmovw %k6, 8(%rsi)
-; AVX512F-ONLY-NEXT:    kmovw %k4, 10(%rsi)
-; AVX512F-ONLY-NEXT:    kmovw %k5, 4(%rsi)
-; AVX512F-ONLY-NEXT:    kmovw %k3, 6(%rsi)
-; AVX512F-ONLY-NEXT:    kmovw %k1, (%rsi)
-; AVX512F-ONLY-NEXT:    kmovw %k0, 2(%rsi)
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k3} {z}
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k3
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm2, %zmm0
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k3} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k7} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k5} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 320(%rsi), %zmm4 {%k6} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 256(%rsi), %zmm5 {%k4} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
+; AVX512F-ONLY-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512F-ONLY-NEXT:    vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm7, 384(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm6, 448(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm5, 256(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm4, 320(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm1, (%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
 ; AVX512F-ONLY-NEXT:    vzeroupper
 ; AVX512F-ONLY-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: mask_replication_factor2_vf64:
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    kmovw (%rdi), %k0
-; AVX512DQ-NEXT:    kmovw 2(%rdi), %k3
-; AVX512DQ-NEXT:    kmovw 4(%rdi), %k4
-; AVX512DQ-NEXT:    kmovw 6(%rdi), %k2
-; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512DQ-NEXT:    kmovw 2(%rdi), %k5
+; AVX512DQ-NEXT:    kmovw 4(%rdi), %k3
+; AVX512DQ-NEXT:    kmovw 6(%rdi), %k1
+; AVX512DQ-NEXT:    vpmovm2d %k1, %zmm0
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm2
-; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k0
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k1
+; AVX512DQ-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm2, %zmm0
-; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k1
+; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k2
 ; AVX512DQ-NEXT:    vpmovm2d %k3, %zmm0
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm3
 ; AVX512DQ-NEXT:    vpmovd2m %zmm3, %k3
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm2, %zmm0
-; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k5
-; AVX512DQ-NEXT:    vpmovm2d %k4, %zmm0
+; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k4
+; AVX512DQ-NEXT:    vpmovm2d %k5, %zmm0
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm3
-; AVX512DQ-NEXT:    vpmovd2m %zmm3, %k4
+; AVX512DQ-NEXT:    vpmovd2m %zmm3, %k5
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm2, %zmm0
 ; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k6
-; AVX512DQ-NEXT:    vpmovm2d %k2, %zmm0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
-; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k2
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k7
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm2, %zmm0
-; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k7
-; AVX512DQ-NEXT:    kmovw %k7, 12(%rsi)
-; AVX512DQ-NEXT:    kmovw %k2, 14(%rsi)
-; AVX512DQ-NEXT:    kmovw %k6, 8(%rsi)
-; AVX512DQ-NEXT:    kmovw %k4, 10(%rsi)
-; AVX512DQ-NEXT:    kmovw %k5, 4(%rsi)
-; AVX512DQ-NEXT:    kmovw %k3, 6(%rsi)
-; AVX512DQ-NEXT:    kmovw %k1, (%rsi)
-; AVX512DQ-NEXT:    kmovw %k0, 2(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k1
+; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
+; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k7} {z}
+; AVX512DQ-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k6} {z}
+; AVX512DQ-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k5} {z}
+; AVX512DQ-NEXT:    vmovdqa32 320(%rsi), %zmm4 {%k4} {z}
+; AVX512DQ-NEXT:    vmovdqa32 256(%rsi), %zmm5 {%k3} {z}
+; AVX512DQ-NEXT:    vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512DQ-NEXT:    vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
+; AVX512DQ-NEXT:    vmovdqa64 %zmm7, 384(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm6, 448(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm5, 256(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 320(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm1, (%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
@@ -394,15 +468,35 @@ define void @mask_replication_factor2_vf64(ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512BW-ONLY:       # %bb.0:
 ; AVX512BW-ONLY-NEXT:    kmovq (%rdi), %k0
 ; AVX512BW-ONLY-NEXT:    vpmovm2b %k0, %zmm0
-; AVX512BW-ONLY-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,0,1,2,3,2,3]
+; AVX512BW-ONLY-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,4,5,6,7,6,7]
 ; AVX512BW-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; AVX512BW-ONLY-NEXT:    vpshufb %zmm2, %zmm1, %zmm1
-; AVX512BW-ONLY-NEXT:    vpmovb2m %zmm1, %k0
-; AVX512BW-ONLY-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,4,5,6,7,6,7]
+; AVX512BW-ONLY-NEXT:    vpmovb2m %zmm1, %k1
+; AVX512BW-ONLY-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,2,3,2,3]
 ; AVX512BW-ONLY-NEXT:    vpshufb %zmm2, %zmm0, %zmm0
-; AVX512BW-ONLY-NEXT:    vpmovb2m %zmm0, %k1
-; AVX512BW-ONLY-NEXT:    kmovq %k1, 8(%rsi)
-; AVX512BW-ONLY-NEXT:    kmovq %k0, (%rsi)
+; AVX512BW-ONLY-NEXT:    vpmovb2m %zmm0, %k2
+; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k2, %k3
+; AVX512BW-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k3} {z}
+; AVX512BW-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k2} {z}
+; AVX512BW-ONLY-NEXT:    kshiftrq $32, %k2, %k2
+; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k2, %k3
+; AVX512BW-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k3} {z}
+; AVX512BW-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k2} {z}
+; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT:    vmovdqa32 320(%rsi), %zmm4 {%k2} {z}
+; AVX512BW-ONLY-NEXT:    vmovdqa32 256(%rsi), %zmm5 {%k1} {z}
+; AVX512BW-ONLY-NEXT:    kshiftrq $32, %k1, %k1
+; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT:    vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
+; AVX512BW-ONLY-NEXT:    vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm7, 384(%rdx)
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm6, 448(%rdx)
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm5, 256(%rdx)
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm4, 320(%rdx)
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm1, (%rdx)
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
 ; AVX512BW-ONLY-NEXT:    vzeroupper
 ; AVX512BW-ONLY-NEXT:    retq
 ;
@@ -410,4421 +504,1918 @@ define void @mask_replication_factor2_vf64(ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512VBMI-ONLY:       # %bb.0:
 ; AVX512VBMI-ONLY-NEXT:    kmovq (%rdi), %k0
 ; AVX512VBMI-ONLY-NEXT:    vpmovm2b %k0, %zmm0
-; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX512VBMI-ONLY-NEXT:    vpermb %zmm0, %zmm1, %zmm1
-; AVX512VBMI-ONLY-NEXT:    vpmovb2m %zmm1, %k0
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
+; AVX512VBMI-ONLY-NEXT:    vpermb %zmm0, %zmm1, %zmm1
+; AVX512VBMI-ONLY-NEXT:    vpmovb2m %zmm1, %k1
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512VBMI-ONLY-NEXT:    vpermb %zmm0, %zmm1, %zmm0
-; AVX512VBMI-ONLY-NEXT:    vpmovb2m %zmm0, %k1
-; AVX512VBMI-ONLY-NEXT:    kmovq %k1, 8(%rsi)
-; AVX512VBMI-ONLY-NEXT:    kmovq %k0, (%rsi)
+; AVX512VBMI-ONLY-NEXT:    vpmovb2m %zmm0, %k2
+; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k2, %k3
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k3} {z}
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k2} {z}
+; AVX512VBMI-ONLY-NEXT:    kshiftrq $32, %k2, %k2
+; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k2, %k3
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k3} {z}
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k2} {z}
+; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 320(%rsi), %zmm4 {%k2} {z}
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 256(%rsi), %zmm5 {%k1} {z}
+; AVX512VBMI-ONLY-NEXT:    kshiftrq $32, %k1, %k1
+; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm7, 384(%rdx)
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm6, 448(%rdx)
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm5, 256(%rdx)
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm4, 320(%rdx)
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm1, (%rdx)
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
 ; AVX512VBMI-ONLY-NEXT:    vzeroupper
 ; AVX512VBMI-ONLY-NEXT:    retq
-  %src.vec = load <64 x i1>, ptr %in.vec, align 64
-  %tgt.mask = shufflevector <64 x i1> %src.vec, <64 x i1> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
-  store <128 x i1> %tgt.mask, ptr %out.vec, align 64
+  %src.mask = load <64 x i1>, ptr %in.maskvec, align 64
+  %tgt.mask = shufflevector <64 x i1> %src.mask, <64 x i1> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
+  %data = call <128 x i32> @llvm.masked.load.v128i32.p0(ptr %in.vec, i32 64, <128 x i1> %tgt.mask, <128 x i32> poison)
+  store <128 x i32> %data, ptr %out.vec, align 64
   ret void
 }
 
-define void @mask_replication_factor3_vf2(ptr %in.vec, ptr %out.vec) nounwind {
+define void @mask_replication_factor3_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512F-ONLY-LABEL: mask_replication_factor3_vf2:
 ; AVX512F-ONLY:       # %bb.0:
-; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k0
-; AVX512F-ONLY-NEXT:    kshiftrw $1, %k0, %k1
-; AVX512F-ONLY-NEXT:    kmovw %k1, %eax
-; AVX512F-ONLY-NEXT:    kmovw %k0, %ecx
-; AVX512F-ONLY-NEXT:    andb $1, %cl
-; AVX512F-ONLY-NEXT:    leal (%rcx,%rcx), %edx
-; AVX512F-ONLY-NEXT:    orb %cl, %dl
-; AVX512F-ONLY-NEXT:    shlb $2, %cl
-; AVX512F-ONLY-NEXT:    orb %dl, %cl
-; AVX512F-ONLY-NEXT:    movl %eax, %edx
-; AVX512F-ONLY-NEXT:    andb $1, %dl
-; AVX512F-ONLY-NEXT:    leal (,%rdx,8), %edi
-; AVX512F-ONLY-NEXT:    orb %cl, %dil
-; AVX512F-ONLY-NEXT:    shlb $4, %dl
-; AVX512F-ONLY-NEXT:    orb %dil, %dl
-; AVX512F-ONLY-NEXT:    shlb $5, %al
-; AVX512F-ONLY-NEXT:    orb %dl, %al
-; AVX512F-ONLY-NEXT:    andb $63, %al
-; AVX512F-ONLY-NEXT:    movb %al, (%rsi)
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
+; AVX512F-ONLY-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512F-ONLY-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,0,0,1,1,1,u,u>
+; AVX512F-ONLY-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX512F-ONLY-NEXT:    vpslld $31, %ymm0, %ymm0
+; AVX512F-ONLY-NEXT:    movb $63, %al
+; AVX512F-ONLY-NEXT:    kmovw %eax, %k1
+; AVX512F-ONLY-NEXT:    vptestmd %ymm0, %ymm0, %k1 {%k1}
+; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %ymm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512F-ONLY-NEXT:    vmovq %xmm1, 16(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa %xmm0, (%rdx)
+; AVX512F-ONLY-NEXT:    vzeroupper
 ; AVX512F-ONLY-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: mask_replication_factor3_vf2:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    kmovb (%rdi), %k0
-; AVX512DQ-NEXT:    kshiftrb $1, %k0, %k1
-; AVX512DQ-NEXT:    kmovw %k1, %eax
-; AVX512DQ-NEXT:    kmovw %k0, %ecx
-; AVX512DQ-NEXT:    andb $1, %cl
-; AVX512DQ-NEXT:    leal (%rcx,%rcx), %edx
-; AVX512DQ-NEXT:    orb %cl, %dl
-; AVX512DQ-NEXT:    shlb $2, %cl
-; AVX512DQ-NEXT:    orb %dl, %cl
-; AVX512DQ-NEXT:    movl %eax, %edx
-; AVX512DQ-NEXT:    andb $1, %dl
-; AVX512DQ-NEXT:    leal (,%rdx,8), %edi
-; AVX512DQ-NEXT:    orb %cl, %dil
-; AVX512DQ-NEXT:    shlb $4, %dl
-; AVX512DQ-NEXT:    orb %dil, %dl
-; AVX512DQ-NEXT:    shlb $5, %al
-; AVX512DQ-NEXT:    orb %dl, %al
-; AVX512DQ-NEXT:    andb $63, %al
-; AVX512DQ-NEXT:    movb %al, (%rsi)
+; AVX512DQ-NEXT:    kmovw (%rdi), %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %ymm0
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,0,0,1,1,1,u,u>
+; AVX512DQ-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512DQ-NEXT:    movb $63, %al
+; AVX512DQ-NEXT:    kmovw %eax, %k1
+; AVX512DQ-NEXT:    vpcmpgtd %ymm0, %ymm1, %k1 {%k1}
+; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %ymm0 {%k1} {z}
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-NEXT:    vmovq %xmm1, 16(%rdx)
+; AVX512DQ-NEXT:    vmovdqa %xmm0, (%rdx)
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512BW-LABEL: mask_replication_factor3_vf2:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    kmovw (%rdi), %k0
-; AVX512BW-NEXT:    kshiftrw $1, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    kmovd %k0, %ecx
-; AVX512BW-NEXT:    andb $1, %cl
-; AVX512BW-NEXT:    leal (%rcx,%rcx), %edx
-; AVX512BW-NEXT:    orb %cl, %dl
-; AVX512BW-NEXT:    shlb $2, %cl
-; AVX512BW-NEXT:    orb %dl, %cl
-; AVX512BW-NEXT:    movl %eax, %edx
-; AVX512BW-NEXT:    andb $1, %dl
-; AVX512BW-NEXT:    leal (,%rdx,8), %edi
-; AVX512BW-NEXT:    orb %cl, %dil
-; AVX512BW-NEXT:    shlb $4, %dl
-; AVX512BW-NEXT:    orb %dil, %dl
-; AVX512BW-NEXT:    shlb $5, %al
-; AVX512BW-NEXT:    orb %dl, %al
-; AVX512BW-NEXT:    andb $63, %al
-; AVX512BW-NEXT:    movb %al, (%rsi)
+; AVX512BW-NEXT:    kmovq (%rdi), %k1
+; AVX512BW-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512BW-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,0,0,1,1,1,u,u>
+; AVX512BW-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX512BW-NEXT:    vpslld $31, %ymm0, %ymm0
+; AVX512BW-NEXT:    movb $63, %al
+; AVX512BW-NEXT:    kmovd %eax, %k1
+; AVX512BW-NEXT:    vptestmd %ymm0, %ymm0, %k1 {%k1}
+; AVX512BW-NEXT:    vmovdqa32 (%rsi), %ymm0 {%k1} {z}
+; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT:    vmovq %xmm1, 16(%rdx)
+; AVX512BW-NEXT:    vmovdqa %xmm0, (%rdx)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
-  %src.vec = load <2 x i1>, ptr %in.vec, align 64
-  %tgt.mask = shufflevector <2 x i1> %src.vec, <2 x i1> poison, <6 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1>
-  store <6 x i1> %tgt.mask, ptr %out.vec, align 64
+  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
+  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <2 x i32> <i32 0, i32 1>
+  %tgt.mask = shufflevector <2 x i1> %src.mask, <2 x i1> poison, <6 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1>
+  %data = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr %in.vec, i32 64, <6 x i1> %tgt.mask, <6 x i32> poison)
+  %data.padded = shufflevector <6 x i32> %data, <6 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  store <6 x i32> %data, ptr %out.vec, align 64
   ret void
 }
 
-define void @mask_replication_factor3_vf4(ptr %in.vec, ptr %out.vec) nounwind {
+define void @mask_replication_factor3_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512F-ONLY-LABEL: mask_replication_factor3_vf4:
 ; AVX512F-ONLY:       # %bb.0:
-; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k2
-; AVX512F-ONLY-NEXT:    kshiftrw $3, %k2, %k0
-; AVX512F-ONLY-NEXT:    kshiftrw $2, %k2, %k1
-; AVX512F-ONLY-NEXT:    kshiftrw $1, %k2, %k3
-; AVX512F-ONLY-NEXT:    kmovw %k2, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    leal (%rax,%rax,2), %ecx
-; AVX512F-ONLY-NEXT:    leal (%rcx,%rax,4), %ecx
-; AVX512F-ONLY-NEXT:    kmovw %k3, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    leal (%rcx,%rax,8), %ecx
-; AVX512F-ONLY-NEXT:    movl %eax, %edx
-; AVX512F-ONLY-NEXT:    shll $4, %edx
-; AVX512F-ONLY-NEXT:    orl %ecx, %edx
-; AVX512F-ONLY-NEXT:    shll $5, %eax
-; AVX512F-ONLY-NEXT:    orl %edx, %eax
-; AVX512F-ONLY-NEXT:    kmovw %k1, %ecx
-; AVX512F-ONLY-NEXT:    andl $1, %ecx
-; AVX512F-ONLY-NEXT:    movl %ecx, %edx
-; AVX512F-ONLY-NEXT:    shll $6, %edx
-; AVX512F-ONLY-NEXT:    movl %ecx, %edi
-; AVX512F-ONLY-NEXT:    shll $7, %edi
-; AVX512F-ONLY-NEXT:    orl %edx, %edi
-; AVX512F-ONLY-NEXT:    shll $8, %ecx
-; AVX512F-ONLY-NEXT:    orl %edi, %ecx
-; AVX512F-ONLY-NEXT:    kmovw %k0, %edx
-; AVX512F-ONLY-NEXT:    movl %edx, %edi
-; AVX512F-ONLY-NEXT:    andl $1, %edi
-; AVX512F-ONLY-NEXT:    movl %edi, %r8d
-; AVX512F-ONLY-NEXT:    shll $9, %r8d
-; AVX512F-ONLY-NEXT:    orl %ecx, %r8d
-; AVX512F-ONLY-NEXT:    shll $10, %edi
-; AVX512F-ONLY-NEXT:    orl %r8d, %edi
-; AVX512F-ONLY-NEXT:    shll $11, %edx
-; AVX512F-ONLY-NEXT:    orl %edi, %edx
-; AVX512F-ONLY-NEXT:    orl %eax, %edx
-; AVX512F-ONLY-NEXT:    andl $4095, %edx # imm = 0xFFF
-; AVX512F-ONLY-NEXT:    movw %dx, (%rsi)
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <0,0,0,1,1,1,2,2,2,3,3,3,u,u,u,u>
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512F-ONLY-NEXT:    vpslld $31, %zmm0, %zmm0
+; AVX512F-ONLY-NEXT:    movw $4095, %ax # imm = 0xFFF
+; AVX512F-ONLY-NEXT:    kmovw %eax, %k1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
+; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vextracti32x4 $2, %zmm0, 32(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa %ymm0, (%rdx)
+; AVX512F-ONLY-NEXT:    vzeroupper
 ; AVX512F-ONLY-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: mask_replication_factor3_vf4:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    kmovb (%rdi), %k2
-; AVX512DQ-NEXT:    kshiftrb $3, %k2, %k0
-; AVX512DQ-NEXT:    kshiftrb $2, %k2, %k1
-; AVX512DQ-NEXT:    kshiftrb $1, %k2, %k3
-; AVX512DQ-NEXT:    kmovw %k2, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    leal (%rax,%rax,2), %ecx
-; AVX512DQ-NEXT:    leal (%rcx,%rax,4), %ecx
-; AVX512DQ-NEXT:    kmovw %k3, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    leal (%rcx,%rax,8), %ecx
-; AVX512DQ-NEXT:    movl %eax, %edx
-; AVX512DQ-NEXT:    shll $4, %edx
-; AVX512DQ-NEXT:    orl %ecx, %edx
-; AVX512DQ-NEXT:    shll $5, %eax
-; AVX512DQ-NEXT:    orl %edx, %eax
-; AVX512DQ-NEXT:    kmovw %k1, %ecx
-; AVX512DQ-NEXT:    andl $1, %ecx
-; AVX512DQ-NEXT:    movl %ecx, %edx
-; AVX512DQ-NEXT:    shll $6, %edx
-; AVX512DQ-NEXT:    movl %ecx, %edi
-; AVX512DQ-NEXT:    shll $7, %edi
-; AVX512DQ-NEXT:    orl %edx, %edi
-; AVX512DQ-NEXT:    shll $8, %ecx
-; AVX512DQ-NEXT:    orl %edi, %ecx
-; AVX512DQ-NEXT:    kmovw %k0, %edx
-; AVX512DQ-NEXT:    movl %edx, %edi
-; AVX512DQ-NEXT:    andl $1, %edi
-; AVX512DQ-NEXT:    movl %edi, %r8d
-; AVX512DQ-NEXT:    shll $9, %r8d
-; AVX512DQ-NEXT:    orl %ecx, %r8d
-; AVX512DQ-NEXT:    shll $10, %edi
-; AVX512DQ-NEXT:    orl %r8d, %edi
-; AVX512DQ-NEXT:    shll $11, %edx
-; AVX512DQ-NEXT:    orl %edi, %edx
-; AVX512DQ-NEXT:    orl %eax, %edx
-; AVX512DQ-NEXT:    andl $4095, %edx # imm = 0xFFF
-; AVX512DQ-NEXT:    movw %dx, (%rsi)
+; AVX512DQ-NEXT:    kmovw (%rdi), %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <0,0,0,1,1,1,2,2,2,3,3,3,u,u,u,u>
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512DQ-NEXT:    movw $4095, %ax # imm = 0xFFF
+; AVX512DQ-NEXT:    kmovw %eax, %k1
+; AVX512DQ-NEXT:    vpcmpgtd %zmm0, %zmm1, %k1 {%k1}
+; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
+; AVX512DQ-NEXT:    vextracti32x4 $2, %zmm0, 32(%rdx)
+; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rdx)
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512BW-LABEL: mask_replication_factor3_vf4:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    kmovw (%rdi), %k2
-; AVX512BW-NEXT:    kshiftrw $3, %k2, %k0
-; AVX512BW-NEXT:    kshiftrw $2, %k2, %k1
-; AVX512BW-NEXT:    kshiftrw $1, %k2, %k3
-; AVX512BW-NEXT:    kmovd %k2, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    leal (%rax,%rax,2), %ecx
-; AVX512BW-NEXT:    leal (%rcx,%rax,4), %ecx
-; AVX512BW-NEXT:    kmovd %k3, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    leal (%rcx,%rax,8), %ecx
-; AVX512BW-NEXT:    movl %eax, %edx
-; AVX512BW-NEXT:    shll $4, %edx
-; AVX512BW-NEXT:    orl %ecx, %edx
-; AVX512BW-NEXT:    shll $5, %eax
-; AVX512BW-NEXT:    orl %edx, %eax
-; AVX512BW-NEXT:    kmovd %k1, %ecx
-; AVX512BW-NEXT:    andl $1, %ecx
-; AVX512BW-NEXT:    movl %ecx, %edx
-; AVX512BW-NEXT:    shll $6, %edx
-; AVX512BW-NEXT:    movl %ecx, %edi
-; AVX512BW-NEXT:    shll $7, %edi
-; AVX512BW-NEXT:    orl %edx, %edi
-; AVX512BW-NEXT:    shll $8, %ecx
-; AVX512BW-NEXT:    orl %edi, %ecx
-; AVX512BW-NEXT:    kmovd %k0, %edx
-; AVX512BW-NEXT:    movl %edx, %edi
-; AVX512BW-NEXT:    andl $1, %edi
-; AVX512BW-NEXT:    movl %edi, %r8d
-; AVX512BW-NEXT:    shll $9, %r8d
-; AVX512BW-NEXT:    orl %ecx, %r8d
-; AVX512BW-NEXT:    shll $10, %edi
-; AVX512BW-NEXT:    orl %r8d, %edi
-; AVX512BW-NEXT:    shll $11, %edx
-; AVX512BW-NEXT:    orl %edi, %edx
-; AVX512BW-NEXT:    orl %eax, %edx
-; AVX512BW-NEXT:    andl $4095, %edx # imm = 0xFFF
-; AVX512BW-NEXT:    movw %dx, (%rsi)
+; AVX512BW-NEXT:    kmovq (%rdi), %k1
+; AVX512BW-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <0,0,0,1,1,1,2,2,2,3,3,3,u,u,u,u>
+; AVX512BW-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vpslld $31, %zmm0, %zmm0
+; AVX512BW-NEXT:    movw $4095, %ax # imm = 0xFFF
+; AVX512BW-NEXT:    kmovd %eax, %k1
+; AVX512BW-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
+; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    vextracti32x4 $2, %zmm0, 32(%rdx)
+; AVX512BW-NEXT:    vmovdqa %ymm0, (%rdx)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
-  %src.vec = load <4 x i1>, ptr %in.vec, align 64
-  %tgt.mask = shufflevector <4 x i1> %src.vec, <4 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
-  store <12 x i1> %tgt.mask, ptr %out.vec, align 64
+  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
+  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %tgt.mask = shufflevector <4 x i1> %src.mask, <4 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
+  %data = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr %in.vec, i32 64, <12 x i1> %tgt.mask, <12 x i32> poison)
+  %data.padded = shufflevector <12 x i32> %data, <12 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef, i32 undef, i32 undef>
+  store <12 x i32> %data, ptr %out.vec, align 64
   ret void
 }
 
-define void @mask_replication_factor3_vf8(ptr %in.vec, ptr %out.vec) nounwind {
+define void @mask_replication_factor3_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512F-ONLY-LABEL: mask_replication_factor3_vf8:
 ; AVX512F-ONLY:       # %bb.0:
-; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k6
-; AVX512F-ONLY-NEXT:    kshiftrw $7, %k6, %k0
-; AVX512F-ONLY-NEXT:    kshiftrw $6, %k6, %k1
-; AVX512F-ONLY-NEXT:    kshiftrw $5, %k6, %k2
-; AVX512F-ONLY-NEXT:    kshiftrw $4, %k6, %k3
-; AVX512F-ONLY-NEXT:    kshiftrw $3, %k6, %k4
-; AVX512F-ONLY-NEXT:    kshiftrw $2, %k6, %k5
-; AVX512F-ONLY-NEXT:    kshiftrw $1, %k6, %k7
-; AVX512F-ONLY-NEXT:    kmovw %k6, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    leal (%rax,%rax,2), %ecx
-; AVX512F-ONLY-NEXT:    leal (%rcx,%rax,4), %ecx
-; AVX512F-ONLY-NEXT:    kmovw %k7, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    leal (%rcx,%rax,8), %ecx
-; AVX512F-ONLY-NEXT:    movl %eax, %edx
-; AVX512F-ONLY-NEXT:    shll $4, %edx
-; AVX512F-ONLY-NEXT:    orl %ecx, %edx
-; AVX512F-ONLY-NEXT:    shll $5, %eax
-; AVX512F-ONLY-NEXT:    orl %edx, %eax
-; AVX512F-ONLY-NEXT:    kmovw %k5, %ecx
-; AVX512F-ONLY-NEXT:    andl $1, %ecx
-; AVX512F-ONLY-NEXT:    movl %ecx, %edx
-; AVX512F-ONLY-NEXT:    shll $6, %edx
-; AVX512F-ONLY-NEXT:    movl %ecx, %edi
-; AVX512F-ONLY-NEXT:    shll $7, %edi
-; AVX512F-ONLY-NEXT:    orl %edx, %edi
-; AVX512F-ONLY-NEXT:    shll $8, %ecx
-; AVX512F-ONLY-NEXT:    orl %edi, %ecx
-; AVX512F-ONLY-NEXT:    kmovw %k4, %edx
-; AVX512F-ONLY-NEXT:    andl $1, %edx
-; AVX512F-ONLY-NEXT:    movl %edx, %edi
-; AVX512F-ONLY-NEXT:    shll $9, %edi
-; AVX512F-ONLY-NEXT:    orl %ecx, %edi
-; AVX512F-ONLY-NEXT:    movl %edx, %ecx
-; AVX512F-ONLY-NEXT:    shll $10, %ecx
-; AVX512F-ONLY-NEXT:    orl %edi, %ecx
-; AVX512F-ONLY-NEXT:    shll $11, %edx
-; AVX512F-ONLY-NEXT:    orl %ecx, %edx
-; AVX512F-ONLY-NEXT:    kmovw %k3, %edi
-; AVX512F-ONLY-NEXT:    andl $1, %edi
-; AVX512F-ONLY-NEXT:    movl %edi, %ecx
-; AVX512F-ONLY-NEXT:    shll $12, %ecx
-; AVX512F-ONLY-NEXT:    orl %edx, %ecx
-; AVX512F-ONLY-NEXT:    movl %edi, %edx
-; AVX512F-ONLY-NEXT:    shll $13, %edx
-; AVX512F-ONLY-NEXT:    orl %ecx, %edx
-; AVX512F-ONLY-NEXT:    shll $14, %edi
-; AVX512F-ONLY-NEXT:    orl %edx, %edi
-; AVX512F-ONLY-NEXT:    kmovw %k2, %edx
-; AVX512F-ONLY-NEXT:    andl $1, %edx
-; AVX512F-ONLY-NEXT:    movl %edx, %ecx
-; AVX512F-ONLY-NEXT:    shll $15, %ecx
-; AVX512F-ONLY-NEXT:    orl %edi, %ecx
-; AVX512F-ONLY-NEXT:    orl %eax, %ecx
-; AVX512F-ONLY-NEXT:    movw %cx, (%rsi)
-; AVX512F-ONLY-NEXT:    movl %edx, %eax
-; AVX512F-ONLY-NEXT:    shll $16, %eax
-; AVX512F-ONLY-NEXT:    shll $17, %edx
-; AVX512F-ONLY-NEXT:    orl %eax, %edx
-; AVX512F-ONLY-NEXT:    kmovw %k1, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movl %eax, %edi
-; AVX512F-ONLY-NEXT:    shll $18, %edi
-; AVX512F-ONLY-NEXT:    orl %edx, %edi
-; AVX512F-ONLY-NEXT:    movl %eax, %edx
-; AVX512F-ONLY-NEXT:    shll $19, %edx
-; AVX512F-ONLY-NEXT:    orl %edi, %edx
-; AVX512F-ONLY-NEXT:    shll $20, %eax
-; AVX512F-ONLY-NEXT:    orl %edx, %eax
-; AVX512F-ONLY-NEXT:    kmovw %k0, %edx
-; AVX512F-ONLY-NEXT:    movl %edx, %edi
-; AVX512F-ONLY-NEXT:    andl $1, %edi
-; AVX512F-ONLY-NEXT:    movl %edi, %r8d
-; AVX512F-ONLY-NEXT:    shll $21, %r8d
-; AVX512F-ONLY-NEXT:    orl %eax, %r8d
-; AVX512F-ONLY-NEXT:    shll $22, %edi
-; AVX512F-ONLY-NEXT:    orl %r8d, %edi
-; AVX512F-ONLY-NEXT:    shll $23, %edx
-; AVX512F-ONLY-NEXT:    orl %edi, %edx
-; AVX512F-ONLY-NEXT:    orl %ecx, %edx
-; AVX512F-ONLY-NEXT:    shrl $16, %edx
-; AVX512F-ONLY-NEXT:    movb %dl, 2(%rsi)
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k2
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
+; AVX512F-ONLY-NEXT:    movw $1, %ax
+; AVX512F-ONLY-NEXT:    kmovw %eax, %k2
+; AVX512F-ONLY-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k2}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k2
+; AVX512F-ONLY-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512F-ONLY-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa {{.*#+}} ymm1 = [5,5,6,6,6,7,7,7]
+; AVX512F-ONLY-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX512F-ONLY-NEXT:    vptestmd %ymm0, %ymm0, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k2} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa %ymm1, 64(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, (%rdx)
+; AVX512F-ONLY-NEXT:    vzeroupper
 ; AVX512F-ONLY-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: mask_replication_factor3_vf8:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    kmovb (%rdi), %k6
-; AVX512DQ-NEXT:    kshiftrb $7, %k6, %k0
-; AVX512DQ-NEXT:    kshiftrb $6, %k6, %k1
-; AVX512DQ-NEXT:    kshiftrb $5, %k6, %k2
-; AVX512DQ-NEXT:    kshiftrb $4, %k6, %k3
-; AVX512DQ-NEXT:    kshiftrb $3, %k6, %k4
-; AVX512DQ-NEXT:    kshiftrb $2, %k6, %k5
-; AVX512DQ-NEXT:    kshiftrb $1, %k6, %k7
-; AVX512DQ-NEXT:    kmovw %k6, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    leal (%rax,%rax,2), %ecx
-; AVX512DQ-NEXT:    leal (%rcx,%rax,4), %ecx
-; AVX512DQ-NEXT:    kmovw %k7, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    leal (%rcx,%rax,8), %ecx
-; AVX512DQ-NEXT:    movl %eax, %edx
-; AVX512DQ-NEXT:    shll $4, %edx
-; AVX512DQ-NEXT:    orl %ecx, %edx
-; AVX512DQ-NEXT:    shll $5, %eax
-; AVX512DQ-NEXT:    orl %edx, %eax
-; AVX512DQ-NEXT:    kmovw %k5, %ecx
-; AVX512DQ-NEXT:    andl $1, %ecx
-; AVX512DQ-NEXT:    movl %ecx, %edx
-; AVX512DQ-NEXT:    shll $6, %edx
-; AVX512DQ-NEXT:    movl %ecx, %edi
-; AVX512DQ-NEXT:    shll $7, %edi
-; AVX512DQ-NEXT:    orl %edx, %edi
-; AVX512DQ-NEXT:    shll $8, %ecx
-; AVX512DQ-NEXT:    orl %edi, %ecx
-; AVX512DQ-NEXT:    kmovw %k4, %edx
-; AVX512DQ-NEXT:    andl $1, %edx
-; AVX512DQ-NEXT:    movl %edx, %edi
-; AVX512DQ-NEXT:    shll $9, %edi
-; AVX512DQ-NEXT:    orl %ecx, %edi
-; AVX512DQ-NEXT:    movl %edx, %ecx
-; AVX512DQ-NEXT:    shll $10, %ecx
-; AVX512DQ-NEXT:    orl %edi, %ecx
-; AVX512DQ-NEXT:    shll $11, %edx
-; AVX512DQ-NEXT:    orl %ecx, %edx
-; AVX512DQ-NEXT:    kmovw %k3, %edi
-; AVX512DQ-NEXT:    andl $1, %edi
-; AVX512DQ-NEXT:    movl %edi, %ecx
-; AVX512DQ-NEXT:    shll $12, %ecx
-; AVX512DQ-NEXT:    orl %edx, %ecx
-; AVX512DQ-NEXT:    movl %edi, %edx
-; AVX512DQ-NEXT:    shll $13, %edx
-; AVX512DQ-NEXT:    orl %ecx, %edx
-; AVX512DQ-NEXT:    shll $14, %edi
-; AVX512DQ-NEXT:    orl %edx, %edi
-; AVX512DQ-NEXT:    kmovw %k2, %edx
-; AVX512DQ-NEXT:    andl $1, %edx
-; AVX512DQ-NEXT:    movl %edx, %ecx
-; AVX512DQ-NEXT:    shll $15, %ecx
-; AVX512DQ-NEXT:    orl %edi, %ecx
-; AVX512DQ-NEXT:    orl %eax, %ecx
-; AVX512DQ-NEXT:    movw %cx, (%rsi)
-; AVX512DQ-NEXT:    movl %edx, %eax
-; AVX512DQ-NEXT:    shll $16, %eax
-; AVX512DQ-NEXT:    shll $17, %edx
-; AVX512DQ-NEXT:    orl %eax, %edx
-; AVX512DQ-NEXT:    kmovw %k1, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movl %eax, %edi
-; AVX512DQ-NEXT:    shll $18, %edi
-; AVX512DQ-NEXT:    orl %edx, %edi
-; AVX512DQ-NEXT:    movl %eax, %edx
-; AVX512DQ-NEXT:    shll $19, %edx
-; AVX512DQ-NEXT:    orl %edi, %edx
-; AVX512DQ-NEXT:    shll $20, %eax
-; AVX512DQ-NEXT:    orl %edx, %eax
-; AVX512DQ-NEXT:    kmovw %k0, %edx
-; AVX512DQ-NEXT:    movl %edx, %edi
-; AVX512DQ-NEXT:    andl $1, %edi
-; AVX512DQ-NEXT:    movl %edi, %r8d
-; AVX512DQ-NEXT:    shll $21, %r8d
-; AVX512DQ-NEXT:    orl %eax, %r8d
-; AVX512DQ-NEXT:    shll $22, %edi
-; AVX512DQ-NEXT:    orl %r8d, %edi
-; AVX512DQ-NEXT:    shll $23, %edx
-; AVX512DQ-NEXT:    orl %edi, %edx
-; AVX512DQ-NEXT:    orl %ecx, %edx
-; AVX512DQ-NEXT:    shrl $16, %edx
-; AVX512DQ-NEXT:    movb %dl, 2(%rsi)
+; AVX512DQ-NEXT:    kmovb (%rdi), %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k1
+; AVX512DQ-NEXT:    vpmovm2d %k1, %zmm1
+; AVX512DQ-NEXT:    movw $1, %ax
+; AVX512DQ-NEXT:    kmovw %eax, %k1
+; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k1
+; AVX512DQ-NEXT:    vpmovm2d %k0, %ymm0
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [5,5,6,6,6,7,7,7]
+; AVX512DQ-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX512DQ-NEXT:    vpmovd2m %ymm0, %k2
+; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
+; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm1 {%k2} {z}
+; AVX512DQ-NEXT:    vmovdqa %ymm1, 64(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm0, (%rdx)
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512BW-LABEL: mask_replication_factor3_vf8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    kmovw (%rdi), %k6
-; AVX512BW-NEXT:    kshiftrw $7, %k6, %k0
-; AVX512BW-NEXT:    kshiftrw $6, %k6, %k1
-; AVX512BW-NEXT:    kshiftrw $5, %k6, %k2
-; AVX512BW-NEXT:    kshiftrw $4, %k6, %k3
-; AVX512BW-NEXT:    kshiftrw $3, %k6, %k4
-; AVX512BW-NEXT:    kshiftrw $2, %k6, %k5
-; AVX512BW-NEXT:    kshiftrw $1, %k6, %k7
-; AVX512BW-NEXT:    kmovd %k6, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    leal (%rax,%rax,2), %ecx
-; AVX512BW-NEXT:    leal (%rcx,%rax,4), %ecx
-; AVX512BW-NEXT:    kmovd %k7, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    leal (%rcx,%rax,8), %ecx
-; AVX512BW-NEXT:    movl %eax, %edx
-; AVX512BW-NEXT:    shll $4, %edx
-; AVX512BW-NEXT:    orl %ecx, %edx
-; AVX512BW-NEXT:    shll $5, %eax
-; AVX512BW-NEXT:    orl %edx, %eax
-; AVX512BW-NEXT:    kmovd %k5, %ecx
-; AVX512BW-NEXT:    andl $1, %ecx
-; AVX512BW-NEXT:    movl %ecx, %edx
-; AVX512BW-NEXT:    shll $6, %edx
-; AVX512BW-NEXT:    movl %ecx, %edi
-; AVX512BW-NEXT:    shll $7, %edi
-; AVX512BW-NEXT:    orl %edx, %edi
-; AVX512BW-NEXT:    shll $8, %ecx
-; AVX512BW-NEXT:    orl %edi, %ecx
-; AVX512BW-NEXT:    kmovd %k4, %edx
-; AVX512BW-NEXT:    andl $1, %edx
-; AVX512BW-NEXT:    movl %edx, %edi
-; AVX512BW-NEXT:    shll $9, %edi
-; AVX512BW-NEXT:    orl %ecx, %edi
-; AVX512BW-NEXT:    movl %edx, %ecx
-; AVX512BW-NEXT:    shll $10, %ecx
-; AVX512BW-NEXT:    orl %edi, %ecx
-; AVX512BW-NEXT:    shll $11, %edx
-; AVX512BW-NEXT:    orl %ecx, %edx
-; AVX512BW-NEXT:    kmovd %k3, %edi
-; AVX512BW-NEXT:    andl $1, %edi
-; AVX512BW-NEXT:    movl %edi, %ecx
-; AVX512BW-NEXT:    shll $12, %ecx
-; AVX512BW-NEXT:    orl %edx, %ecx
-; AVX512BW-NEXT:    movl %edi, %edx
-; AVX512BW-NEXT:    shll $13, %edx
-; AVX512BW-NEXT:    orl %ecx, %edx
-; AVX512BW-NEXT:    shll $14, %edi
-; AVX512BW-NEXT:    orl %edx, %edi
-; AVX512BW-NEXT:    kmovd %k2, %edx
-; AVX512BW-NEXT:    andl $1, %edx
-; AVX512BW-NEXT:    movl %edx, %ecx
-; AVX512BW-NEXT:    shll $15, %ecx
-; AVX512BW-NEXT:    orl %edi, %ecx
-; AVX512BW-NEXT:    orl %eax, %ecx
-; AVX512BW-NEXT:    movw %cx, (%rsi)
-; AVX512BW-NEXT:    movl %edx, %eax
-; AVX512BW-NEXT:    shll $16, %eax
-; AVX512BW-NEXT:    shll $17, %edx
-; AVX512BW-NEXT:    orl %eax, %edx
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movl %eax, %edi
-; AVX512BW-NEXT:    shll $18, %edi
-; AVX512BW-NEXT:    orl %edx, %edi
-; AVX512BW-NEXT:    movl %eax, %edx
-; AVX512BW-NEXT:    shll $19, %edx
-; AVX512BW-NEXT:    orl %edi, %edx
-; AVX512BW-NEXT:    shll $20, %eax
-; AVX512BW-NEXT:    orl %edx, %eax
-; AVX512BW-NEXT:    kmovd %k0, %edx
-; AVX512BW-NEXT:    movl %edx, %edi
-; AVX512BW-NEXT:    andl $1, %edi
-; AVX512BW-NEXT:    movl %edi, %r8d
-; AVX512BW-NEXT:    shll $21, %r8d
-; AVX512BW-NEXT:    orl %eax, %r8d
-; AVX512BW-NEXT:    shll $22, %edi
-; AVX512BW-NEXT:    orl %r8d, %edi
-; AVX512BW-NEXT:    shll $23, %edx
-; AVX512BW-NEXT:    orl %edi, %edx
-; AVX512BW-NEXT:    orl %ecx, %edx
-; AVX512BW-NEXT:    shrl $16, %edx
-; AVX512BW-NEXT:    movb %dl, 2(%rsi)
+; AVX512BW-NEXT:    kmovw (%rdi), %k0
+; AVX512BW-NEXT:    vpmovm2w %k0, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT:    vpermw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT:    movl $16777215, %eax # imm = 0xFFFFFF
+; AVX512BW-NEXT:    kmovd %eax, %k1
+; AVX512BW-NEXT:    vpcmpgtw %zmm0, %zmm1, %k1 {%k1}
+; AVX512BW-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
+; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k1} {z}
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, (%rdx)
+; AVX512BW-NEXT:    vmovdqa %ymm0, 64(%rdx)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
-  %src.vec = load <8 x i1>, ptr %in.vec, align 64
-  %tgt.mask = shufflevector <8 x i1> %src.vec, <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
-  store <24 x i1> %tgt.mask, ptr %out.vec, align 64
+  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
+  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %tgt.mask = shufflevector <8 x i1> %src.mask, <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
+  %data = call <24 x i32> @llvm.masked.load.v24i32.p0(ptr %in.vec, i32 64, <24 x i1> %tgt.mask, <24 x i32> poison)
+  %data.padded = shufflevector <24 x i32> %data, <24 x i32> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  store <24 x i32> %data, ptr %out.vec, align 64
   ret void
 }
 
-define void @mask_replication_factor3_vf16(ptr %in.vec, ptr %out.vec) nounwind {
+define void @mask_replication_factor3_vf16(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512F-ONLY-LABEL: mask_replication_factor3_vf16:
 ; AVX512F-ONLY:       # %bb.0:
-; AVX512F-ONLY-NEXT:    pushq %rbp
-; AVX512F-ONLY-NEXT:    pushq %r15
-; AVX512F-ONLY-NEXT:    pushq %r14
-; AVX512F-ONLY-NEXT:    pushq %r12
-; AVX512F-ONLY-NEXT:    pushq %rbx
-; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k4
-; AVX512F-ONLY-NEXT:    kshiftrw $15, %k4, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, %eax
-; AVX512F-ONLY-NEXT:    kshiftrw $14, %k4, %k0
-; AVX512F-ONLY-NEXT:    kshiftrw $13, %k4, %k1
-; AVX512F-ONLY-NEXT:    kshiftrw $12, %k4, %k2
-; AVX512F-ONLY-NEXT:    kshiftrw $11, %k4, %k3
-; AVX512F-ONLY-NEXT:    kshiftrw $10, %k4, %k5
-; AVX512F-ONLY-NEXT:    kmovw %k5, %ecx
-; AVX512F-ONLY-NEXT:    kshiftrw $9, %k4, %k5
-; AVX512F-ONLY-NEXT:    kmovw %k5, %edx
-; AVX512F-ONLY-NEXT:    kshiftrw $8, %k4, %k5
-; AVX512F-ONLY-NEXT:    kmovw %k5, %r8d
-; AVX512F-ONLY-NEXT:    kshiftrw $7, %k4, %k5
-; AVX512F-ONLY-NEXT:    kmovw %k5, %r9d
-; AVX512F-ONLY-NEXT:    kshiftrw $6, %k4, %k5
-; AVX512F-ONLY-NEXT:    kmovw %k5, %r10d
-; AVX512F-ONLY-NEXT:    kshiftrw $5, %k4, %k5
-; AVX512F-ONLY-NEXT:    kmovw %k5, %r11d
-; AVX512F-ONLY-NEXT:    kshiftrw $4, %k4, %k5
-; AVX512F-ONLY-NEXT:    kmovw %k5, %ebx
-; AVX512F-ONLY-NEXT:    kshiftrw $3, %k4, %k5
-; AVX512F-ONLY-NEXT:    kmovw %k5, %ebp
-; AVX512F-ONLY-NEXT:    kshiftrw $2, %k4, %k5
-; AVX512F-ONLY-NEXT:    kmovw %k5, %r14d
-; AVX512F-ONLY-NEXT:    kshiftrw $1, %k4, %k5
-; AVX512F-ONLY-NEXT:    kmovw %k5, %edi
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r15d
-; AVX512F-ONLY-NEXT:    movzbl %r15b, %r15d
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    leaq (%r15,%r15,2), %r12
-; AVX512F-ONLY-NEXT:    leaq (%r12,%r15,4), %r15
-; AVX512F-ONLY-NEXT:    movzbl %dil, %edi
-; AVX512F-ONLY-NEXT:    andl $1, %edi
-; AVX512F-ONLY-NEXT:    leaq (%r15,%rdi,8), %r15
-; AVX512F-ONLY-NEXT:    movq %rdi, %r12
-; AVX512F-ONLY-NEXT:    shlq $4, %r12
-; AVX512F-ONLY-NEXT:    orq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $5, %rdi
-; AVX512F-ONLY-NEXT:    orq %r12, %rdi
-; AVX512F-ONLY-NEXT:    movzbl %r14b, %r14d
-; AVX512F-ONLY-NEXT:    andl $1, %r14d
-; AVX512F-ONLY-NEXT:    movq %r14, %r15
-; AVX512F-ONLY-NEXT:    shlq $6, %r15
-; AVX512F-ONLY-NEXT:    movq %r14, %r12
-; AVX512F-ONLY-NEXT:    shlq $7, %r12
-; AVX512F-ONLY-NEXT:    orq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $8, %r14
-; AVX512F-ONLY-NEXT:    orq %r12, %r14
-; AVX512F-ONLY-NEXT:    movzbl %bpl, %r15d
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $9, %r12
-; AVX512F-ONLY-NEXT:    orq %r14, %r12
-; AVX512F-ONLY-NEXT:    movq %r15, %r14
-; AVX512F-ONLY-NEXT:    shlq $10, %r14
-; AVX512F-ONLY-NEXT:    orq %r12, %r14
-; AVX512F-ONLY-NEXT:    shlq $11, %r15
-; AVX512F-ONLY-NEXT:    orq %r14, %r15
-; AVX512F-ONLY-NEXT:    movzbl %bl, %ebx
-; AVX512F-ONLY-NEXT:    andl $1, %ebx
-; AVX512F-ONLY-NEXT:    movq %rbx, %r14
-; AVX512F-ONLY-NEXT:    shlq $12, %r14
-; AVX512F-ONLY-NEXT:    orq %r15, %r14
-; AVX512F-ONLY-NEXT:    movq %rbx, %r15
-; AVX512F-ONLY-NEXT:    shlq $13, %r15
-; AVX512F-ONLY-NEXT:    orq %r14, %r15
-; AVX512F-ONLY-NEXT:    shlq $14, %rbx
-; AVX512F-ONLY-NEXT:    orq %r15, %rbx
-; AVX512F-ONLY-NEXT:    movzbl %r11b, %r11d
-; AVX512F-ONLY-NEXT:    andl $1, %r11d
-; AVX512F-ONLY-NEXT:    movq %r11, %r14
-; AVX512F-ONLY-NEXT:    shlq $15, %r14
-; AVX512F-ONLY-NEXT:    orq %rbx, %r14
-; AVX512F-ONLY-NEXT:    movq %r11, %rbx
-; AVX512F-ONLY-NEXT:    shlq $16, %rbx
-; AVX512F-ONLY-NEXT:    orq %r14, %rbx
-; AVX512F-ONLY-NEXT:    shlq $17, %r11
-; AVX512F-ONLY-NEXT:    orq %rbx, %r11
-; AVX512F-ONLY-NEXT:    movzbl %r10b, %r10d
-; AVX512F-ONLY-NEXT:    andl $1, %r10d
-; AVX512F-ONLY-NEXT:    movq %r10, %rbx
-; AVX512F-ONLY-NEXT:    shlq $18, %rbx
-; AVX512F-ONLY-NEXT:    orq %r11, %rbx
-; AVX512F-ONLY-NEXT:    movq %r10, %r11
-; AVX512F-ONLY-NEXT:    shlq $19, %r11
-; AVX512F-ONLY-NEXT:    orq %rbx, %r11
-; AVX512F-ONLY-NEXT:    shlq $20, %r10
-; AVX512F-ONLY-NEXT:    orq %r11, %r10
-; AVX512F-ONLY-NEXT:    movzbl %r9b, %r9d
-; AVX512F-ONLY-NEXT:    andl $1, %r9d
-; AVX512F-ONLY-NEXT:    movq %r9, %r11
-; AVX512F-ONLY-NEXT:    shlq $21, %r11
-; AVX512F-ONLY-NEXT:    orq %r10, %r11
-; AVX512F-ONLY-NEXT:    movq %r9, %r10
-; AVX512F-ONLY-NEXT:    shlq $22, %r10
-; AVX512F-ONLY-NEXT:    orq %r11, %r10
-; AVX512F-ONLY-NEXT:    shlq $23, %r9
-; AVX512F-ONLY-NEXT:    orq %r10, %r9
-; AVX512F-ONLY-NEXT:    movzbl %r8b, %r8d
-; AVX512F-ONLY-NEXT:    andl $1, %r8d
-; AVX512F-ONLY-NEXT:    movq %r8, %r10
-; AVX512F-ONLY-NEXT:    shlq $24, %r10
-; AVX512F-ONLY-NEXT:    orq %r9, %r10
-; AVX512F-ONLY-NEXT:    movq %r8, %r9
-; AVX512F-ONLY-NEXT:    shlq $25, %r9
-; AVX512F-ONLY-NEXT:    orq %r10, %r9
-; AVX512F-ONLY-NEXT:    shlq $26, %r8
-; AVX512F-ONLY-NEXT:    orq %r9, %r8
-; AVX512F-ONLY-NEXT:    movzbl %dl, %r9d
-; AVX512F-ONLY-NEXT:    andl $1, %r9d
-; AVX512F-ONLY-NEXT:    movq %r9, %rdx
-; AVX512F-ONLY-NEXT:    shlq $27, %rdx
-; AVX512F-ONLY-NEXT:    orq %r8, %rdx
-; AVX512F-ONLY-NEXT:    movq %r9, %r8
-; AVX512F-ONLY-NEXT:    shlq $28, %r8
-; AVX512F-ONLY-NEXT:    orq %rdx, %r8
-; AVX512F-ONLY-NEXT:    shlq $29, %r9
-; AVX512F-ONLY-NEXT:    orq %r8, %r9
-; AVX512F-ONLY-NEXT:    movzbl %cl, %edx
-; AVX512F-ONLY-NEXT:    andl $1, %edx
-; AVX512F-ONLY-NEXT:    movq %rdx, %r8
-; AVX512F-ONLY-NEXT:    shlq $30, %r8
-; AVX512F-ONLY-NEXT:    orq %r9, %r8
-; AVX512F-ONLY-NEXT:    shlq $31, %rdx
-; AVX512F-ONLY-NEXT:    orq %r8, %rdx
-; AVX512F-ONLY-NEXT:    orq %rdi, %rdx
-; AVX512F-ONLY-NEXT:    movl %edx, (%rsi)
-; AVX512F-ONLY-NEXT:    andl $1, %ecx
-; AVX512F-ONLY-NEXT:    shlq $32, %rcx
-; AVX512F-ONLY-NEXT:    kmovw %k3, %edi
-; AVX512F-ONLY-NEXT:    andl $1, %edi
-; AVX512F-ONLY-NEXT:    movq %rdi, %r8
-; AVX512F-ONLY-NEXT:    shlq $33, %r8
-; AVX512F-ONLY-NEXT:    orq %rcx, %r8
-; AVX512F-ONLY-NEXT:    movq %rdi, %rcx
-; AVX512F-ONLY-NEXT:    shlq $34, %rcx
-; AVX512F-ONLY-NEXT:    orq %r8, %rcx
-; AVX512F-ONLY-NEXT:    shlq $35, %rdi
-; AVX512F-ONLY-NEXT:    orq %rcx, %rdi
-; AVX512F-ONLY-NEXT:    kmovw %k2, %ecx
-; AVX512F-ONLY-NEXT:    andl $1, %ecx
-; AVX512F-ONLY-NEXT:    movq %rcx, %r8
-; AVX512F-ONLY-NEXT:    shlq $36, %r8
-; AVX512F-ONLY-NEXT:    orq %rdi, %r8
-; AVX512F-ONLY-NEXT:    movq %rcx, %rdi
-; AVX512F-ONLY-NEXT:    shlq $37, %rdi
-; AVX512F-ONLY-NEXT:    orq %r8, %rdi
-; AVX512F-ONLY-NEXT:    shlq $38, %rcx
-; AVX512F-ONLY-NEXT:    orq %rdi, %rcx
-; AVX512F-ONLY-NEXT:    kmovw %k1, %edi
-; AVX512F-ONLY-NEXT:    andl $1, %edi
-; AVX512F-ONLY-NEXT:    movq %rdi, %r8
-; AVX512F-ONLY-NEXT:    shlq $39, %r8
-; AVX512F-ONLY-NEXT:    orq %rcx, %r8
-; AVX512F-ONLY-NEXT:    movq %rdi, %rcx
-; AVX512F-ONLY-NEXT:    shlq $40, %rcx
-; AVX512F-ONLY-NEXT:    orq %r8, %rcx
-; AVX512F-ONLY-NEXT:    shlq $41, %rdi
-; AVX512F-ONLY-NEXT:    orq %rcx, %rdi
-; AVX512F-ONLY-NEXT:    kmovw %k0, %ecx
-; AVX512F-ONLY-NEXT:    andl $1, %ecx
-; AVX512F-ONLY-NEXT:    movq %rcx, %r8
-; AVX512F-ONLY-NEXT:    shlq $42, %r8
-; AVX512F-ONLY-NEXT:    orq %rdi, %r8
-; AVX512F-ONLY-NEXT:    movq %rcx, %rdi
-; AVX512F-ONLY-NEXT:    shlq $43, %rdi
-; AVX512F-ONLY-NEXT:    orq %r8, %rdi
-; AVX512F-ONLY-NEXT:    shlq $44, %rcx
-; AVX512F-ONLY-NEXT:    orq %rdi, %rcx
-; AVX512F-ONLY-NEXT:    movzbl %al, %edi
-; AVX512F-ONLY-NEXT:    # kill: def $eax killed $eax def $rax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %r8
-; AVX512F-ONLY-NEXT:    shlq $45, %r8
-; AVX512F-ONLY-NEXT:    orq %rcx, %r8
-; AVX512F-ONLY-NEXT:    shlq $46, %rax
-; AVX512F-ONLY-NEXT:    orq %r8, %rax
-; AVX512F-ONLY-NEXT:    shlq $47, %rdi
-; AVX512F-ONLY-NEXT:    orq %rax, %rdi
-; AVX512F-ONLY-NEXT:    orq %rdx, %rdi
-; AVX512F-ONLY-NEXT:    shrq $32, %rdi
-; AVX512F-ONLY-NEXT:    movw %di, 4(%rsi)
-; AVX512F-ONLY-NEXT:    popq %rbx
-; AVX512F-ONLY-NEXT:    popq %r12
-; AVX512F-ONLY-NEXT:    popq %r14
-; AVX512F-ONLY-NEXT:    popq %r15
-; AVX512F-ONLY-NEXT:    popq %rbp
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; AVX512F-ONLY-NEXT:    movw $1, %ax
+; AVX512F-ONLY-NEXT:    kmovw %eax, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k2
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k3
+; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm1 {%k3} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm2 {%k2} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm2, 64(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm1, 128(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, (%rdx)
+; AVX512F-ONLY-NEXT:    vzeroupper
 ; AVX512F-ONLY-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: mask_replication_factor3_vf16:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    pushq %rbp
-; AVX512DQ-NEXT:    pushq %r15
-; AVX512DQ-NEXT:    pushq %r14
-; AVX512DQ-NEXT:    pushq %r12
-; AVX512DQ-NEXT:    pushq %rbx
-; AVX512DQ-NEXT:    kmovw (%rdi), %k4
-; AVX512DQ-NEXT:    kshiftrw $15, %k4, %k0
-; AVX512DQ-NEXT:    kmovw %k0, %eax
-; AVX512DQ-NEXT:    kshiftrw $14, %k4, %k0
-; AVX512DQ-NEXT:    kshiftrw $13, %k4, %k1
-; AVX512DQ-NEXT:    kshiftrw $12, %k4, %k2
-; AVX512DQ-NEXT:    kshiftrw $11, %k4, %k3
-; AVX512DQ-NEXT:    kshiftrw $10, %k4, %k5
-; AVX512DQ-NEXT:    kmovw %k5, %ecx
-; AVX512DQ-NEXT:    kshiftrw $9, %k4, %k5
-; AVX512DQ-NEXT:    kmovw %k5, %edx
-; AVX512DQ-NEXT:    kshiftrw $8, %k4, %k5
-; AVX512DQ-NEXT:    kmovw %k5, %r8d
-; AVX512DQ-NEXT:    kshiftrw $7, %k4, %k5
-; AVX512DQ-NEXT:    kmovw %k5, %r9d
-; AVX512DQ-NEXT:    kshiftrw $6, %k4, %k5
-; AVX512DQ-NEXT:    kmovw %k5, %r10d
-; AVX512DQ-NEXT:    kshiftrw $5, %k4, %k5
-; AVX512DQ-NEXT:    kmovw %k5, %r11d
-; AVX512DQ-NEXT:    kshiftrw $4, %k4, %k5
-; AVX512DQ-NEXT:    kmovw %k5, %ebx
-; AVX512DQ-NEXT:    kshiftrw $3, %k4, %k5
-; AVX512DQ-NEXT:    kmovw %k5, %ebp
-; AVX512DQ-NEXT:    kshiftrw $2, %k4, %k5
-; AVX512DQ-NEXT:    kmovw %k5, %r14d
-; AVX512DQ-NEXT:    kshiftrw $1, %k4, %k5
-; AVX512DQ-NEXT:    kmovw %k5, %edi
-; AVX512DQ-NEXT:    kmovw %k4, %r15d
-; AVX512DQ-NEXT:    movzbl %r15b, %r15d
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    leaq (%r15,%r15,2), %r12
-; AVX512DQ-NEXT:    leaq (%r12,%r15,4), %r15
-; AVX512DQ-NEXT:    movzbl %dil, %edi
-; AVX512DQ-NEXT:    andl $1, %edi
-; AVX512DQ-NEXT:    leaq (%r15,%rdi,8), %r15
-; AVX512DQ-NEXT:    movq %rdi, %r12
-; AVX512DQ-NEXT:    shlq $4, %r12
-; AVX512DQ-NEXT:    orq %r15, %r12
-; AVX512DQ-NEXT:    shlq $5, %rdi
-; AVX512DQ-NEXT:    orq %r12, %rdi
-; AVX512DQ-NEXT:    movzbl %r14b, %r14d
-; AVX512DQ-NEXT:    andl $1, %r14d
-; AVX512DQ-NEXT:    movq %r14, %r15
-; AVX512DQ-NEXT:    shlq $6, %r15
-; AVX512DQ-NEXT:    movq %r14, %r12
-; AVX512DQ-NEXT:    shlq $7, %r12
-; AVX512DQ-NEXT:    orq %r15, %r12
-; AVX512DQ-NEXT:    shlq $8, %r14
-; AVX512DQ-NEXT:    orq %r12, %r14
-; AVX512DQ-NEXT:    movzbl %bpl, %r15d
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $9, %r12
-; AVX512DQ-NEXT:    orq %r14, %r12
-; AVX512DQ-NEXT:    movq %r15, %r14
-; AVX512DQ-NEXT:    shlq $10, %r14
-; AVX512DQ-NEXT:    orq %r12, %r14
-; AVX512DQ-NEXT:    shlq $11, %r15
-; AVX512DQ-NEXT:    orq %r14, %r15
-; AVX512DQ-NEXT:    movzbl %bl, %ebx
-; AVX512DQ-NEXT:    andl $1, %ebx
-; AVX512DQ-NEXT:    movq %rbx, %r14
-; AVX512DQ-NEXT:    shlq $12, %r14
-; AVX512DQ-NEXT:    orq %r15, %r14
-; AVX512DQ-NEXT:    movq %rbx, %r15
-; AVX512DQ-NEXT:    shlq $13, %r15
-; AVX512DQ-NEXT:    orq %r14, %r15
-; AVX512DQ-NEXT:    shlq $14, %rbx
-; AVX512DQ-NEXT:    orq %r15, %rbx
-; AVX512DQ-NEXT:    movzbl %r11b, %r11d
-; AVX512DQ-NEXT:    andl $1, %r11d
-; AVX512DQ-NEXT:    movq %r11, %r14
-; AVX512DQ-NEXT:    shlq $15, %r14
-; AVX512DQ-NEXT:    orq %rbx, %r14
-; AVX512DQ-NEXT:    movq %r11, %rbx
-; AVX512DQ-NEXT:    shlq $16, %rbx
-; AVX512DQ-NEXT:    orq %r14, %rbx
-; AVX512DQ-NEXT:    shlq $17, %r11
-; AVX512DQ-NEXT:    orq %rbx, %r11
-; AVX512DQ-NEXT:    movzbl %r10b, %r10d
-; AVX512DQ-NEXT:    andl $1, %r10d
-; AVX512DQ-NEXT:    movq %r10, %rbx
-; AVX512DQ-NEXT:    shlq $18, %rbx
-; AVX512DQ-NEXT:    orq %r11, %rbx
-; AVX512DQ-NEXT:    movq %r10, %r11
-; AVX512DQ-NEXT:    shlq $19, %r11
-; AVX512DQ-NEXT:    orq %rbx, %r11
-; AVX512DQ-NEXT:    shlq $20, %r10
-; AVX512DQ-NEXT:    orq %r11, %r10
-; AVX512DQ-NEXT:    movzbl %r9b, %r9d
-; AVX512DQ-NEXT:    andl $1, %r9d
-; AVX512DQ-NEXT:    movq %r9, %r11
-; AVX512DQ-NEXT:    shlq $21, %r11
-; AVX512DQ-NEXT:    orq %r10, %r11
-; AVX512DQ-NEXT:    movq %r9, %r10
-; AVX512DQ-NEXT:    shlq $22, %r10
-; AVX512DQ-NEXT:    orq %r11, %r10
-; AVX512DQ-NEXT:    shlq $23, %r9
-; AVX512DQ-NEXT:    orq %r10, %r9
-; AVX512DQ-NEXT:    movzbl %r8b, %r8d
-; AVX512DQ-NEXT:    andl $1, %r8d
-; AVX512DQ-NEXT:    movq %r8, %r10
-; AVX512DQ-NEXT:    shlq $24, %r10
-; AVX512DQ-NEXT:    orq %r9, %r10
-; AVX512DQ-NEXT:    movq %r8, %r9
-; AVX512DQ-NEXT:    shlq $25, %r9
-; AVX512DQ-NEXT:    orq %r10, %r9
-; AVX512DQ-NEXT:    shlq $26, %r8
-; AVX512DQ-NEXT:    orq %r9, %r8
-; AVX512DQ-NEXT:    movzbl %dl, %r9d
-; AVX512DQ-NEXT:    andl $1, %r9d
-; AVX512DQ-NEXT:    movq %r9, %rdx
-; AVX512DQ-NEXT:    shlq $27, %rdx
-; AVX512DQ-NEXT:    orq %r8, %rdx
-; AVX512DQ-NEXT:    movq %r9, %r8
-; AVX512DQ-NEXT:    shlq $28, %r8
-; AVX512DQ-NEXT:    orq %rdx, %r8
-; AVX512DQ-NEXT:    shlq $29, %r9
-; AVX512DQ-NEXT:    orq %r8, %r9
-; AVX512DQ-NEXT:    movzbl %cl, %edx
-; AVX512DQ-NEXT:    andl $1, %edx
-; AVX512DQ-NEXT:    movq %rdx, %r8
-; AVX512DQ-NEXT:    shlq $30, %r8
-; AVX512DQ-NEXT:    orq %r9, %r8
-; AVX512DQ-NEXT:    shlq $31, %rdx
-; AVX512DQ-NEXT:    orq %r8, %rdx
-; AVX512DQ-NEXT:    orq %rdi, %rdx
-; AVX512DQ-NEXT:    movl %edx, (%rsi)
-; AVX512DQ-NEXT:    andl $1, %ecx
-; AVX512DQ-NEXT:    shlq $32, %rcx
-; AVX512DQ-NEXT:    kmovw %k3, %edi
-; AVX512DQ-NEXT:    andl $1, %edi
-; AVX512DQ-NEXT:    movq %rdi, %r8
-; AVX512DQ-NEXT:    shlq $33, %r8
-; AVX512DQ-NEXT:    orq %rcx, %r8
-; AVX512DQ-NEXT:    movq %rdi, %rcx
-; AVX512DQ-NEXT:    shlq $34, %rcx
-; AVX512DQ-NEXT:    orq %r8, %rcx
-; AVX512DQ-NEXT:    shlq $35, %rdi
-; AVX512DQ-NEXT:    orq %rcx, %rdi
-; AVX512DQ-NEXT:    kmovw %k2, %ecx
-; AVX512DQ-NEXT:    andl $1, %ecx
-; AVX512DQ-NEXT:    movq %rcx, %r8
-; AVX512DQ-NEXT:    shlq $36, %r8
-; AVX512DQ-NEXT:    orq %rdi, %r8
-; AVX512DQ-NEXT:    movq %rcx, %rdi
-; AVX512DQ-NEXT:    shlq $37, %rdi
-; AVX512DQ-NEXT:    orq %r8, %rdi
-; AVX512DQ-NEXT:    shlq $38, %rcx
-; AVX512DQ-NEXT:    orq %rdi, %rcx
-; AVX512DQ-NEXT:    kmovw %k1, %edi
-; AVX512DQ-NEXT:    andl $1, %edi
-; AVX512DQ-NEXT:    movq %rdi, %r8
-; AVX512DQ-NEXT:    shlq $39, %r8
-; AVX512DQ-NEXT:    orq %rcx, %r8
-; AVX512DQ-NEXT:    movq %rdi, %rcx
-; AVX512DQ-NEXT:    shlq $40, %rcx
-; AVX512DQ-NEXT:    orq %r8, %rcx
-; AVX512DQ-NEXT:    shlq $41, %rdi
-; AVX512DQ-NEXT:    orq %rcx, %rdi
-; AVX512DQ-NEXT:    kmovw %k0, %ecx
-; AVX512DQ-NEXT:    andl $1, %ecx
-; AVX512DQ-NEXT:    movq %rcx, %r8
-; AVX512DQ-NEXT:    shlq $42, %r8
-; AVX512DQ-NEXT:    orq %rdi, %r8
-; AVX512DQ-NEXT:    movq %rcx, %rdi
-; AVX512DQ-NEXT:    shlq $43, %rdi
-; AVX512DQ-NEXT:    orq %r8, %rdi
-; AVX512DQ-NEXT:    shlq $44, %rcx
-; AVX512DQ-NEXT:    orq %rdi, %rcx
-; AVX512DQ-NEXT:    movzbl %al, %edi
-; AVX512DQ-NEXT:    # kill: def $eax killed $eax def $rax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %r8
-; AVX512DQ-NEXT:    shlq $45, %r8
-; AVX512DQ-NEXT:    orq %rcx, %r8
-; AVX512DQ-NEXT:    shlq $46, %rax
-; AVX512DQ-NEXT:    orq %r8, %rax
-; AVX512DQ-NEXT:    shlq $47, %rdi
-; AVX512DQ-NEXT:    orq %rax, %rdi
-; AVX512DQ-NEXT:    orq %rdx, %rdi
-; AVX512DQ-NEXT:    shrq $32, %rdi
-; AVX512DQ-NEXT:    movw %di, 4(%rsi)
-; AVX512DQ-NEXT:    popq %rbx
-; AVX512DQ-NEXT:    popq %r12
-; AVX512DQ-NEXT:    popq %r14
-; AVX512DQ-NEXT:    popq %r15
-; AVX512DQ-NEXT:    popq %rbp
+; AVX512DQ-NEXT:    kmovw (%rdi), %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm1
+; AVX512DQ-NEXT:    movw $1, %ax
+; AVX512DQ-NEXT:    kmovw %eax, %k1
+; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k1
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k2
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k3
+; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
+; AVX512DQ-NEXT:    vmovdqa32 128(%rsi), %zmm1 {%k3} {z}
+; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm2 {%k2} {z}
+; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 64(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 128(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm0, (%rdx)
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
-; AVX512BW-LABEL: mask_replication_factor3_vf16:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    pushq %rbp
-; AVX512BW-NEXT:    pushq %r15
-; AVX512BW-NEXT:    pushq %r14
-; AVX512BW-NEXT:    pushq %r12
-; AVX512BW-NEXT:    pushq %rbx
-; AVX512BW-NEXT:    kmovw (%rdi), %k4
-; AVX512BW-NEXT:    kshiftrw $15, %k4, %k0
-; AVX512BW-NEXT:    kmovd %k0, %eax
-; AVX512BW-NEXT:    kshiftrw $14, %k4, %k0
-; AVX512BW-NEXT:    kshiftrw $13, %k4, %k1
-; AVX512BW-NEXT:    kshiftrw $12, %k4, %k2
-; AVX512BW-NEXT:    kshiftrw $11, %k4, %k3
-; AVX512BW-NEXT:    kshiftrw $10, %k4, %k5
-; AVX512BW-NEXT:    kmovd %k5, %ecx
-; AVX512BW-NEXT:    kshiftrw $9, %k4, %k5
-; AVX512BW-NEXT:    kmovd %k5, %edx
-; AVX512BW-NEXT:    kshiftrw $8, %k4, %k5
-; AVX512BW-NEXT:    kmovd %k5, %r8d
-; AVX512BW-NEXT:    kshiftrw $7, %k4, %k5
-; AVX512BW-NEXT:    kmovd %k5, %r9d
-; AVX512BW-NEXT:    kshiftrw $6, %k4, %k5
-; AVX512BW-NEXT:    kmovd %k5, %r10d
-; AVX512BW-NEXT:    kshiftrw $5, %k4, %k5
-; AVX512BW-NEXT:    kmovd %k5, %r11d
-; AVX512BW-NEXT:    kshiftrw $4, %k4, %k5
-; AVX512BW-NEXT:    kmovd %k5, %ebx
-; AVX512BW-NEXT:    kshiftrw $3, %k4, %k5
-; AVX512BW-NEXT:    kmovd %k5, %ebp
-; AVX512BW-NEXT:    kshiftrw $2, %k4, %k5
-; AVX512BW-NEXT:    kmovd %k5, %r14d
-; AVX512BW-NEXT:    kshiftrw $1, %k4, %k5
-; AVX512BW-NEXT:    kmovd %k5, %edi
-; AVX512BW-NEXT:    kmovd %k4, %r15d
-; AVX512BW-NEXT:    movzbl %r15b, %r15d
-; AVX512BW-NEXT:    andl $1, %r15d
-; AVX512BW-NEXT:    leaq (%r15,%r15,2), %r12
-; AVX512BW-NEXT:    leaq (%r12,%r15,4), %r15
-; AVX512BW-NEXT:    movzbl %dil, %edi
-; AVX512BW-NEXT:    andl $1, %edi
-; AVX512BW-NEXT:    leaq (%r15,%rdi,8), %r15
-; AVX512BW-NEXT:    movq %rdi, %r12
-; AVX512BW-NEXT:    shlq $4, %r12
-; AVX512BW-NEXT:    orq %r15, %r12
-; AVX512BW-NEXT:    shlq $5, %rdi
-; AVX512BW-NEXT:    orq %r12, %rdi
-; AVX512BW-NEXT:    movzbl %r14b, %r14d
-; AVX512BW-NEXT:    andl $1, %r14d
-; AVX512BW-NEXT:    movq %r14, %r15
-; AVX512BW-NEXT:    shlq $6, %r15
-; AVX512BW-NEXT:    movq %r14, %r12
-; AVX512BW-NEXT:    shlq $7, %r12
-; AVX512BW-NEXT:    orq %r15, %r12
-; AVX512BW-NEXT:    shlq $8, %r14
-; AVX512BW-NEXT:    orq %r12, %r14
-; AVX512BW-NEXT:    movzbl %bpl, %r15d
-; AVX512BW-NEXT:    andl $1, %r15d
-; AVX512BW-NEXT:    movq %r15, %r12
-; AVX512BW-NEXT:    shlq $9, %r12
-; AVX512BW-NEXT:    orq %r14, %r12
-; AVX512BW-NEXT:    movq %r15, %r14
-; AVX512BW-NEXT:    shlq $10, %r14
-; AVX512BW-NEXT:    orq %r12, %r14
-; AVX512BW-NEXT:    shlq $11, %r15
-; AVX512BW-NEXT:    orq %r14, %r15
-; AVX512BW-NEXT:    movzbl %bl, %ebx
-; AVX512BW-NEXT:    andl $1, %ebx
-; AVX512BW-NEXT:    movq %rbx, %r14
-; AVX512BW-NEXT:    shlq $12, %r14
-; AVX512BW-NEXT:    orq %r15, %r14
-; AVX512BW-NEXT:    movq %rbx, %r15
-; AVX512BW-NEXT:    shlq $13, %r15
-; AVX512BW-NEXT:    orq %r14, %r15
-; AVX512BW-NEXT:    shlq $14, %rbx
-; AVX512BW-NEXT:    orq %r15, %rbx
-; AVX512BW-NEXT:    movzbl %r11b, %r11d
-; AVX512BW-NEXT:    andl $1, %r11d
-; AVX512BW-NEXT:    movq %r11, %r14
-; AVX512BW-NEXT:    shlq $15, %r14
-; AVX512BW-NEXT:    orq %rbx, %r14
-; AVX512BW-NEXT:    movq %r11, %rbx
-; AVX512BW-NEXT:    shlq $16, %rbx
-; AVX512BW-NEXT:    orq %r14, %rbx
-; AVX512BW-NEXT:    shlq $17, %r11
-; AVX512BW-NEXT:    orq %rbx, %r11
-; AVX512BW-NEXT:    movzbl %r10b, %r10d
-; AVX512BW-NEXT:    andl $1, %r10d
-; AVX512BW-NEXT:    movq %r10, %rbx
-; AVX512BW-NEXT:    shlq $18, %rbx
-; AVX512BW-NEXT:    orq %r11, %rbx
-; AVX512BW-NEXT:    movq %r10, %r11
-; AVX512BW-NEXT:    shlq $19, %r11
-; AVX512BW-NEXT:    orq %rbx, %r11
-; AVX512BW-NEXT:    shlq $20, %r10
-; AVX512BW-NEXT:    orq %r11, %r10
-; AVX512BW-NEXT:    movzbl %r9b, %r9d
-; AVX512BW-NEXT:    andl $1, %r9d
-; AVX512BW-NEXT:    movq %r9, %r11
-; AVX512BW-NEXT:    shlq $21, %r11
-; AVX512BW-NEXT:    orq %r10, %r11
-; AVX512BW-NEXT:    movq %r9, %r10
-; AVX512BW-NEXT:    shlq $22, %r10
-; AVX512BW-NEXT:    orq %r11, %r10
-; AVX512BW-NEXT:    shlq $23, %r9
-; AVX512BW-NEXT:    orq %r10, %r9
-; AVX512BW-NEXT:    movzbl %r8b, %r8d
-; AVX512BW-NEXT:    andl $1, %r8d
-; AVX512BW-NEXT:    movq %r8, %r10
-; AVX512BW-NEXT:    shlq $24, %r10
-; AVX512BW-NEXT:    orq %r9, %r10
-; AVX512BW-NEXT:    movq %r8, %r9
-; AVX512BW-NEXT:    shlq $25, %r9
-; AVX512BW-NEXT:    orq %r10, %r9
-; AVX512BW-NEXT:    shlq $26, %r8
-; AVX512BW-NEXT:    orq %r9, %r8
-; AVX512BW-NEXT:    movzbl %dl, %r9d
-; AVX512BW-NEXT:    andl $1, %r9d
-; AVX512BW-NEXT:    movq %r9, %rdx
-; AVX512BW-NEXT:    shlq $27, %rdx
-; AVX512BW-NEXT:    orq %r8, %rdx
-; AVX512BW-NEXT:    movq %r9, %r8
-; AVX512BW-NEXT:    shlq $28, %r8
-; AVX512BW-NEXT:    orq %rdx, %r8
-; AVX512BW-NEXT:    shlq $29, %r9
-; AVX512BW-NEXT:    orq %r8, %r9
-; AVX512BW-NEXT:    movzbl %cl, %edx
-; AVX512BW-NEXT:    andl $1, %edx
-; AVX512BW-NEXT:    movq %rdx, %r8
-; AVX512BW-NEXT:    shlq $30, %r8
-; AVX512BW-NEXT:    orq %r9, %r8
-; AVX512BW-NEXT:    shlq $31, %rdx
-; AVX512BW-NEXT:    orq %r8, %rdx
-; AVX512BW-NEXT:    orq %rdi, %rdx
-; AVX512BW-NEXT:    movl %edx, (%rsi)
-; AVX512BW-NEXT:    andl $1, %ecx
-; AVX512BW-NEXT:    shlq $32, %rcx
-; AVX512BW-NEXT:    kmovd %k3, %edi
-; AVX512BW-NEXT:    andl $1, %edi
-; AVX512BW-NEXT:    movq %rdi, %r8
-; AVX512BW-NEXT:    shlq $33, %r8
-; AVX512BW-NEXT:    orq %rcx, %r8
-; AVX512BW-NEXT:    movq %rdi, %rcx
-; AVX512BW-NEXT:    shlq $34, %rcx
-; AVX512BW-NEXT:    orq %r8, %rcx
-; AVX512BW-NEXT:    shlq $35, %rdi
-; AVX512BW-NEXT:    orq %rcx, %rdi
-; AVX512BW-NEXT:    kmovd %k2, %ecx
-; AVX512BW-NEXT:    andl $1, %ecx
-; AVX512BW-NEXT:    movq %rcx, %r8
-; AVX512BW-NEXT:    shlq $36, %r8
-; AVX512BW-NEXT:    orq %rdi, %r8
-; AVX512BW-NEXT:    movq %rcx, %rdi
-; AVX512BW-NEXT:    shlq $37, %rdi
-; AVX512BW-NEXT:    orq %r8, %rdi
-; AVX512BW-NEXT:    shlq $38, %rcx
-; AVX512BW-NEXT:    orq %rdi, %rcx
-; AVX512BW-NEXT:    kmovd %k1, %edi
-; AVX512BW-NEXT:    andl $1, %edi
-; AVX512BW-NEXT:    movq %rdi, %r8
-; AVX512BW-NEXT:    shlq $39, %r8
-; AVX512BW-NEXT:    orq %rcx, %r8
-; AVX512BW-NEXT:    movq %rdi, %rcx
-; AVX512BW-NEXT:    shlq $40, %rcx
-; AVX512BW-NEXT:    orq %r8, %rcx
-; AVX512BW-NEXT:    shlq $41, %rdi
-; AVX512BW-NEXT:    orq %rcx, %rdi
-; AVX512BW-NEXT:    kmovd %k0, %ecx
-; AVX512BW-NEXT:    andl $1, %ecx
-; AVX512BW-NEXT:    movq %rcx, %r8
-; AVX512BW-NEXT:    shlq $42, %r8
-; AVX512BW-NEXT:    orq %rdi, %r8
-; AVX512BW-NEXT:    movq %rcx, %rdi
-; AVX512BW-NEXT:    shlq $43, %rdi
-; AVX512BW-NEXT:    orq %r8, %rdi
-; AVX512BW-NEXT:    shlq $44, %rcx
-; AVX512BW-NEXT:    orq %rdi, %rcx
-; AVX512BW-NEXT:    movzbl %al, %edi
-; AVX512BW-NEXT:    # kill: def $eax killed $eax def $rax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %r8
-; AVX512BW-NEXT:    shlq $45, %r8
-; AVX512BW-NEXT:    orq %rcx, %r8
-; AVX512BW-NEXT:    shlq $46, %rax
-; AVX512BW-NEXT:    orq %r8, %rax
-; AVX512BW-NEXT:    shlq $47, %rdi
-; AVX512BW-NEXT:    orq %rax, %rdi
-; AVX512BW-NEXT:    orq %rdx, %rdi
-; AVX512BW-NEXT:    shrq $32, %rdi
-; AVX512BW-NEXT:    movw %di, 4(%rsi)
-; AVX512BW-NEXT:    popq %rbx
-; AVX512BW-NEXT:    popq %r12
-; AVX512BW-NEXT:    popq %r14
-; AVX512BW-NEXT:    popq %r15
-; AVX512BW-NEXT:    popq %rbp
-; AVX512BW-NEXT:    retq
-  %src.vec = load <16 x i1>, ptr %in.vec, align 64
-  %tgt.mask = shufflevector <16 x i1> %src.vec, <16 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
-  store <48 x i1> %tgt.mask, ptr %out.vec, align 64
+; AVX512BW-ONLY-LABEL: mask_replication_factor3_vf16:
+; AVX512BW-ONLY:       # %bb.0:
+; AVX512BW-ONLY-NEXT:    kmovw (%rdi), %k1
+; AVX512BW-ONLY-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512BW-ONLY-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
+; AVX512BW-ONLY-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,21,21,22,22,22,23,23,23,24,24,24,25,25,25,26,26,42,43,43,43,44,44,44,45,45,45,46,46,46,47,47,47,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-ONLY-NEXT:    vpmovb2m %zmm0, %k2
+; AVX512BW-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k2} {z}
+; AVX512BW-ONLY-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; AVX512BW-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15]
+; AVX512BW-ONLY-NEXT:    vpermd %zmm1, %zmm2, %zmm1
+; AVX512BW-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; AVX512BW-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm1 {%k1} {z}
+; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k2, %k1
+; AVX512BW-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm2 {%k1} {z}
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm2, 64(%rdx)
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm1, 128(%rdx)
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm0, (%rdx)
+; AVX512BW-ONLY-NEXT:    vzeroupper
+; AVX512BW-ONLY-NEXT:    retq
+;
+; AVX512VBMI-ONLY-LABEL: mask_replication_factor3_vf16:
+; AVX512VBMI-ONLY:       # %bb.0:
+; AVX512VBMI-ONLY-NEXT:    kmovw (%rdi), %k1
+; AVX512VBMI-ONLY-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10,10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VBMI-ONLY-NEXT:    vpermb %zmm0, %zmm1, %zmm0
+; AVX512VBMI-ONLY-NEXT:    vpmovb2m %zmm0, %k2
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k2} {z}
+; AVX512VBMI-ONLY-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15]
+; AVX512VBMI-ONLY-NEXT:    vpermd %zmm1, %zmm2, %zmm1
+; AVX512VBMI-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm1 {%k1} {z}
+; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k2, %k1
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm2 {%k1} {z}
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm2, 64(%rdx)
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm1, 128(%rdx)
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm0, (%rdx)
+; AVX512VBMI-ONLY-NEXT:    vzeroupper
+; AVX512VBMI-ONLY-NEXT:    retq
+  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
+  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %tgt.mask = shufflevector <16 x i1> %src.mask, <16 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
+  %data = call <48 x i32> @llvm.masked.load.v48i32.p0(ptr %in.vec, i32 64, <48 x i1> %tgt.mask, <48 x i32> poison)
+  store <48 x i32> %data, ptr %out.vec, align 64
   ret void
 }
 
-define void @mask_replication_factor3_vf32(ptr %in.vec, ptr %out.vec) nounwind {
+define void @mask_replication_factor3_vf32(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512F-ONLY-LABEL: mask_replication_factor3_vf32:
 ; AVX512F-ONLY:       # %bb.0:
-; AVX512F-ONLY-NEXT:    pushq %rbp
-; AVX512F-ONLY-NEXT:    pushq %r15
-; AVX512F-ONLY-NEXT:    pushq %r14
-; AVX512F-ONLY-NEXT:    pushq %r13
-; AVX512F-ONLY-NEXT:    pushq %r12
-; AVX512F-ONLY-NEXT:    pushq %rbx
-; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
-; AVX512F-ONLY-NEXT:    kmovw 2(%rdi), %k0
-; AVX512F-ONLY-NEXT:    kshiftrw $5, %k0, %k2
-; AVX512F-ONLY-NEXT:    kmovw %k2, %eax
-; AVX512F-ONLY-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512F-ONLY-NEXT:    kshiftrw $10, %k1, %k2
-; AVX512F-ONLY-NEXT:    kmovw %k2, %ecx
-; AVX512F-ONLY-NEXT:    kshiftrw $9, %k1, %k2
-; AVX512F-ONLY-NEXT:    kmovw %k2, %edx
-; AVX512F-ONLY-NEXT:    kshiftrw $8, %k1, %k2
-; AVX512F-ONLY-NEXT:    kmovw %k2, %edi
-; AVX512F-ONLY-NEXT:    kshiftrw $7, %k1, %k2
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r8d
-; AVX512F-ONLY-NEXT:    kshiftrw $6, %k1, %k2
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r9d
-; AVX512F-ONLY-NEXT:    kshiftrw $5, %k1, %k2
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r10d
-; AVX512F-ONLY-NEXT:    kshiftrw $4, %k1, %k2
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r11d
-; AVX512F-ONLY-NEXT:    kshiftrw $3, %k1, %k2
-; AVX512F-ONLY-NEXT:    kmovw %k2, %ebx
-; AVX512F-ONLY-NEXT:    kshiftrw $6, %k0, %k2
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r14d
-; AVX512F-ONLY-NEXT:    kshiftrw $7, %k0, %k2
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r15d
-; AVX512F-ONLY-NEXT:    kshiftrw $8, %k0, %k2
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $9, %k0, %k2
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r13d
-; AVX512F-ONLY-NEXT:    kshiftrw $10, %k0, %k2
-; AVX512F-ONLY-NEXT:    andl $1, %r14d
-; AVX512F-ONLY-NEXT:    movl %eax, %ebp
-; AVX512F-ONLY-NEXT:    andl $1, %ebp
-; AVX512F-ONLY-NEXT:    leal (%rbp,%rbp,2), %ebp
-; AVX512F-ONLY-NEXT:    leal (%rbp,%r14,4), %ebp
-; AVX512F-ONLY-NEXT:    leal (%rbp,%r14,8), %ebp
-; AVX512F-ONLY-NEXT:    shll $4, %r14d
-; AVX512F-ONLY-NEXT:    orl %ebp, %r14d
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    movl %r15d, %ebp
-; AVX512F-ONLY-NEXT:    shll $5, %ebp
-; AVX512F-ONLY-NEXT:    orl %r14d, %ebp
-; AVX512F-ONLY-NEXT:    movl %r15d, %r14d
-; AVX512F-ONLY-NEXT:    shll $6, %r14d
-; AVX512F-ONLY-NEXT:    shll $7, %r15d
-; AVX512F-ONLY-NEXT:    orl %r14d, %r15d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movl %r12d, %r14d
-; AVX512F-ONLY-NEXT:    shll $8, %r14d
-; AVX512F-ONLY-NEXT:    orl %r15d, %r14d
-; AVX512F-ONLY-NEXT:    movl %r12d, %r15d
-; AVX512F-ONLY-NEXT:    shll $9, %r15d
-; AVX512F-ONLY-NEXT:    orl %r14d, %r15d
-; AVX512F-ONLY-NEXT:    kmovw %k2, %eax
-; AVX512F-ONLY-NEXT:    kshiftrw $11, %k0, %k2
-; AVX512F-ONLY-NEXT:    shll $10, %r12d
-; AVX512F-ONLY-NEXT:    orl %r15d, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movl %r13d, %r14d
-; AVX512F-ONLY-NEXT:    shll $11, %r14d
-; AVX512F-ONLY-NEXT:    orl %r12d, %r14d
-; AVX512F-ONLY-NEXT:    movl %r13d, %r15d
-; AVX512F-ONLY-NEXT:    shll $12, %r15d
-; AVX512F-ONLY-NEXT:    orl %r14d, %r15d
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $12, %k0, %k2
-; AVX512F-ONLY-NEXT:    shll $13, %r13d
-; AVX512F-ONLY-NEXT:    orl %r15d, %r13d
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movl %eax, %r14d
-; AVX512F-ONLY-NEXT:    shll $14, %r14d
-; AVX512F-ONLY-NEXT:    orl %r13d, %r14d
-; AVX512F-ONLY-NEXT:    movl %eax, %r15d
-; AVX512F-ONLY-NEXT:    shll $15, %r15d
-; AVX512F-ONLY-NEXT:    orl %r14d, %r15d
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r14d
-; AVX512F-ONLY-NEXT:    kshiftrw $13, %k0, %k2
-; AVX512F-ONLY-NEXT:    shll $16, %eax
-; AVX512F-ONLY-NEXT:    orl %r15d, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movl %r12d, %r15d
-; AVX512F-ONLY-NEXT:    shll $17, %r15d
-; AVX512F-ONLY-NEXT:    orl %eax, %r15d
-; AVX512F-ONLY-NEXT:    movl %r12d, %eax
-; AVX512F-ONLY-NEXT:    shll $18, %eax
-; AVX512F-ONLY-NEXT:    orl %r15d, %eax
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r15d
-; AVX512F-ONLY-NEXT:    kshiftrw $14, %k0, %k2
-; AVX512F-ONLY-NEXT:    shll $19, %r12d
-; AVX512F-ONLY-NEXT:    orl %eax, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r14d
-; AVX512F-ONLY-NEXT:    movl %r14d, %eax
-; AVX512F-ONLY-NEXT:    shll $20, %eax
-; AVX512F-ONLY-NEXT:    orl %r12d, %eax
-; AVX512F-ONLY-NEXT:    movl %r14d, %r12d
-; AVX512F-ONLY-NEXT:    shll $21, %r12d
-; AVX512F-ONLY-NEXT:    orl %eax, %r12d
-; AVX512F-ONLY-NEXT:    kmovw %k2, %eax
-; AVX512F-ONLY-NEXT:    kshiftrw $15, %k0, %k2
-; AVX512F-ONLY-NEXT:    shll $22, %r14d
-; AVX512F-ONLY-NEXT:    orl %r12d, %r14d
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    movl %r15d, %r12d
-; AVX512F-ONLY-NEXT:    shll $23, %r12d
-; AVX512F-ONLY-NEXT:    orl %r14d, %r12d
-; AVX512F-ONLY-NEXT:    movl %r15d, %r13d
-; AVX512F-ONLY-NEXT:    shll $24, %r13d
-; AVX512F-ONLY-NEXT:    orl %r12d, %r13d
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r14d
-; AVX512F-ONLY-NEXT:    kshiftrw $2, %k1, %k2
-; AVX512F-ONLY-NEXT:    shll $25, %r15d
-; AVX512F-ONLY-NEXT:    orl %r13d, %r15d
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movl %eax, %r12d
-; AVX512F-ONLY-NEXT:    shll $26, %r12d
-; AVX512F-ONLY-NEXT:    orl %r15d, %r12d
-; AVX512F-ONLY-NEXT:    movl %eax, %r15d
-; AVX512F-ONLY-NEXT:    shll $27, %r15d
-; AVX512F-ONLY-NEXT:    orl %r12d, %r15d
-; AVX512F-ONLY-NEXT:    shll $28, %eax
-; AVX512F-ONLY-NEXT:    orl %r15d, %eax
-; AVX512F-ONLY-NEXT:    movl %r14d, %r13d
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movl %r13d, %r12d
-; AVX512F-ONLY-NEXT:    shll $29, %r12d
-; AVX512F-ONLY-NEXT:    orl %eax, %r12d
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r15d
-; AVX512F-ONLY-NEXT:    kshiftrw $1, %k1, %k2
-; AVX512F-ONLY-NEXT:    shll $30, %r13d
-; AVX512F-ONLY-NEXT:    orl %r12d, %r13d
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $11, %k1, %k2
-; AVX512F-ONLY-NEXT:    shll $31, %r14d
-; AVX512F-ONLY-NEXT:    orl %r13d, %r14d
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r13d
-; AVX512F-ONLY-NEXT:    kshiftrw $12, %k1, %k2
-; AVX512F-ONLY-NEXT:    orl %ebp, %r14d
-; AVX512F-ONLY-NEXT:    movl %r14d, 8(%rsi)
-; AVX512F-ONLY-NEXT:    kmovw %k1, %eax
-; AVX512F-ONLY-NEXT:    movzbl %al, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    leaq (%rax,%rax,2), %r14
-; AVX512F-ONLY-NEXT:    leaq (%r14,%rax,4), %rax
-; AVX512F-ONLY-NEXT:    movzbl %r12b, %r14d
-; AVX512F-ONLY-NEXT:    andl $1, %r14d
-; AVX512F-ONLY-NEXT:    leaq (%rax,%r14,8), %rax
-; AVX512F-ONLY-NEXT:    movq %r14, %r12
-; AVX512F-ONLY-NEXT:    shlq $4, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $5, %r14
-; AVX512F-ONLY-NEXT:    orq %r12, %r14
-; AVX512F-ONLY-NEXT:    movzbl %r15b, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %r15
-; AVX512F-ONLY-NEXT:    shlq $6, %r15
-; AVX512F-ONLY-NEXT:    movq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $7, %r12
-; AVX512F-ONLY-NEXT:    orq %r15, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r15d
-; AVX512F-ONLY-NEXT:    kshiftrw $13, %k1, %k2
-; AVX512F-ONLY-NEXT:    shlq $8, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movzbl %bl, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rbx
-; AVX512F-ONLY-NEXT:    shlq $9, %rbx
-; AVX512F-ONLY-NEXT:    orq %rax, %rbx
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $10, %rax
-; AVX512F-ONLY-NEXT:    orq %rbx, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k2, %ebx
-; AVX512F-ONLY-NEXT:    kshiftrw $14, %k1, %k2
-; AVX512F-ONLY-NEXT:    shlq $11, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movzbl %r11b, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %r11
-; AVX512F-ONLY-NEXT:    shlq $12, %r11
-; AVX512F-ONLY-NEXT:    orq %r12, %r11
-; AVX512F-ONLY-NEXT:    movq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $13, %r12
-; AVX512F-ONLY-NEXT:    orq %r11, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r11d
-; AVX512F-ONLY-NEXT:    kshiftrw $15, %k1, %k1
-; AVX512F-ONLY-NEXT:    shlq $14, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movzbl %r10b, %ebp
-; AVX512F-ONLY-NEXT:    andl $1, %ebp
-; AVX512F-ONLY-NEXT:    movq %rbp, %r10
-; AVX512F-ONLY-NEXT:    shlq $15, %r10
-; AVX512F-ONLY-NEXT:    orq %rax, %r10
-; AVX512F-ONLY-NEXT:    movq %rbp, %rax
-; AVX512F-ONLY-NEXT:    shlq $16, %rax
-; AVX512F-ONLY-NEXT:    orq %r10, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k1, %r10d
-; AVX512F-ONLY-NEXT:    kshiftrw $1, %k0, %k1
-; AVX512F-ONLY-NEXT:    shlq $17, %rbp
-; AVX512F-ONLY-NEXT:    orq %rax, %rbp
-; AVX512F-ONLY-NEXT:    movzbl %r9b, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $18, %rax
-; AVX512F-ONLY-NEXT:    orq %rbp, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %rbp
-; AVX512F-ONLY-NEXT:    shlq $19, %rbp
-; AVX512F-ONLY-NEXT:    orq %rax, %rbp
-; AVX512F-ONLY-NEXT:    kmovw %k1, %r9d
-; AVX512F-ONLY-NEXT:    kshiftrw $2, %k0, %k1
-; AVX512F-ONLY-NEXT:    shlq $20, %r12
-; AVX512F-ONLY-NEXT:    orq %rbp, %r12
-; AVX512F-ONLY-NEXT:    movzbl %r8b, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %r8
-; AVX512F-ONLY-NEXT:    shlq $21, %r8
-; AVX512F-ONLY-NEXT:    orq %r12, %r8
-; AVX512F-ONLY-NEXT:    movq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $22, %r12
-; AVX512F-ONLY-NEXT:    orq %r8, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k1, %r8d
-; AVX512F-ONLY-NEXT:    kshiftrw $3, %k0, %k1
-; AVX512F-ONLY-NEXT:    shlq $23, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movzbl %dil, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rdi
-; AVX512F-ONLY-NEXT:    shlq $24, %rdi
-; AVX512F-ONLY-NEXT:    orq %rax, %rdi
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $25, %rax
-; AVX512F-ONLY-NEXT:    orq %rdi, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k1, %edi
-; AVX512F-ONLY-NEXT:    shlq $26, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movzbl %dl, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %rdx
-; AVX512F-ONLY-NEXT:    shlq $27, %rdx
-; AVX512F-ONLY-NEXT:    orq %r12, %rdx
-; AVX512F-ONLY-NEXT:    movq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $28, %r12
-; AVX512F-ONLY-NEXT:    orq %rdx, %r12
-; AVX512F-ONLY-NEXT:    shlq $29, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movzbl %cl, %ebp
-; AVX512F-ONLY-NEXT:    andl $1, %ebp
-; AVX512F-ONLY-NEXT:    movq %rbp, %rdx
-; AVX512F-ONLY-NEXT:    shlq $30, %rdx
-; AVX512F-ONLY-NEXT:    orq %rax, %rdx
-; AVX512F-ONLY-NEXT:    kmovw %k0, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $4, %k0, %k0
-; AVX512F-ONLY-NEXT:    shlq $31, %rbp
-; AVX512F-ONLY-NEXT:    orq %rdx, %rbp
-; AVX512F-ONLY-NEXT:    kmovw %k0, %edx
-; AVX512F-ONLY-NEXT:    andl $1, %ecx
-; AVX512F-ONLY-NEXT:    shlq $32, %rcx
-; AVX512F-ONLY-NEXT:    orq %rbp, %rcx
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $33, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %rcx
-; AVX512F-ONLY-NEXT:    shlq $34, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    shlq $35, %r13
-; AVX512F-ONLY-NEXT:    orq %rcx, %r13
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $36, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %rcx
-; AVX512F-ONLY-NEXT:    shlq $37, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    shlq $38, %r15
-; AVX512F-ONLY-NEXT:    orq %rcx, %r15
-; AVX512F-ONLY-NEXT:    andl $1, %ebx
-; AVX512F-ONLY-NEXT:    movq %rbx, %rax
-; AVX512F-ONLY-NEXT:    shlq $39, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movq %rbx, %rcx
-; AVX512F-ONLY-NEXT:    shlq $40, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    shlq $41, %rbx
-; AVX512F-ONLY-NEXT:    orq %rcx, %rbx
-; AVX512F-ONLY-NEXT:    andl $1, %r11d
-; AVX512F-ONLY-NEXT:    movq %r11, %rax
-; AVX512F-ONLY-NEXT:    shlq $42, %rax
-; AVX512F-ONLY-NEXT:    orq %rbx, %rax
-; AVX512F-ONLY-NEXT:    movq %r11, %rcx
-; AVX512F-ONLY-NEXT:    shlq $43, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    shlq $44, %r11
-; AVX512F-ONLY-NEXT:    orq %rcx, %r11
-; AVX512F-ONLY-NEXT:    andl $1, %r10d
-; AVX512F-ONLY-NEXT:    movq %r10, %rax
-; AVX512F-ONLY-NEXT:    shlq $45, %rax
-; AVX512F-ONLY-NEXT:    orq %r11, %rax
-; AVX512F-ONLY-NEXT:    movq %r10, %rcx
-; AVX512F-ONLY-NEXT:    shlq $46, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    shlq $47, %r10
-; AVX512F-ONLY-NEXT:    orq %rcx, %r10
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $48, %rax
-; AVX512F-ONLY-NEXT:    orq %r10, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %rcx
-; AVX512F-ONLY-NEXT:    shlq $49, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    shlq $50, %r12
-; AVX512F-ONLY-NEXT:    orq %rcx, %r12
-; AVX512F-ONLY-NEXT:    andl $1, %r9d
-; AVX512F-ONLY-NEXT:    movq %r9, %rax
-; AVX512F-ONLY-NEXT:    shlq $51, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r9, %rcx
-; AVX512F-ONLY-NEXT:    shlq $52, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    shlq $53, %r9
-; AVX512F-ONLY-NEXT:    orq %rcx, %r9
-; AVX512F-ONLY-NEXT:    andl $1, %r8d
-; AVX512F-ONLY-NEXT:    movq %r8, %rax
-; AVX512F-ONLY-NEXT:    shlq $54, %rax
-; AVX512F-ONLY-NEXT:    orq %r9, %rax
-; AVX512F-ONLY-NEXT:    movq %r8, %rcx
-; AVX512F-ONLY-NEXT:    shlq $55, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    shlq $56, %r8
-; AVX512F-ONLY-NEXT:    orq %rcx, %r8
-; AVX512F-ONLY-NEXT:    andl $1, %edi
-; AVX512F-ONLY-NEXT:    movq %rdi, %rax
-; AVX512F-ONLY-NEXT:    shlq $57, %rax
-; AVX512F-ONLY-NEXT:    orq %r8, %rax
-; AVX512F-ONLY-NEXT:    movq %rdi, %rcx
-; AVX512F-ONLY-NEXT:    shlq $58, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    shlq $59, %rdi
-; AVX512F-ONLY-NEXT:    orq %rcx, %rdi
-; AVX512F-ONLY-NEXT:    andl $1, %edx
-; AVX512F-ONLY-NEXT:    movq %rdx, %rax
-; AVX512F-ONLY-NEXT:    shlq $60, %rax
-; AVX512F-ONLY-NEXT:    orq %rdi, %rax
-; AVX512F-ONLY-NEXT:    movq %rdx, %rcx
-; AVX512F-ONLY-NEXT:    shlq $61, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    shlq $62, %rdx
-; AVX512F-ONLY-NEXT:    orq %rcx, %rdx
-; AVX512F-ONLY-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX512F-ONLY-NEXT:    shlq $63, %rax
-; AVX512F-ONLY-NEXT:    orq %rdx, %rax
-; AVX512F-ONLY-NEXT:    orq %r14, %rax
-; AVX512F-ONLY-NEXT:    movq %rax, (%rsi)
-; AVX512F-ONLY-NEXT:    popq %rbx
-; AVX512F-ONLY-NEXT:    popq %r12
-; AVX512F-ONLY-NEXT:    popq %r13
-; AVX512F-ONLY-NEXT:    popq %r14
-; AVX512F-ONLY-NEXT:    popq %r15
-; AVX512F-ONLY-NEXT:    popq %rbp
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k2
+; AVX512F-ONLY-NEXT:    kmovw 2(%rdi), %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm2
+; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k2
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
+; AVX512F-ONLY-NEXT:    movw $1, %ax
+; AVX512F-ONLY-NEXT:    kmovw %eax, %k2
+; AVX512F-ONLY-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k2}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k3
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm2, %zmm3
+; AVX512F-ONLY-NEXT:    vptestmd %zmm3, %zmm3, %k2
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm3, %zmm0
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k4
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm2, %zmm1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k5
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm3, %zmm0
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k6
+; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k3} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 320(%rsi), %zmm1 {%k6} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 256(%rsi), %zmm2 {%k5} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm3 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm4 {%k4} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm5 {%k2} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm5, 64(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm4, 128(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm3, 192(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm2, 256(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm1, 320(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, (%rdx)
+; AVX512F-ONLY-NEXT:    vzeroupper
 ; AVX512F-ONLY-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: mask_replication_factor3_vf32:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    pushq %rbp
-; AVX512DQ-NEXT:    pushq %r15
-; AVX512DQ-NEXT:    pushq %r14
-; AVX512DQ-NEXT:    pushq %r13
-; AVX512DQ-NEXT:    pushq %r12
-; AVX512DQ-NEXT:    pushq %rbx
 ; AVX512DQ-NEXT:    kmovw (%rdi), %k1
 ; AVX512DQ-NEXT:    kmovw 2(%rdi), %k0
-; AVX512DQ-NEXT:    kshiftrw $5, %k0, %k2
-; AVX512DQ-NEXT:    kmovw %k2, %eax
-; AVX512DQ-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512DQ-NEXT:    kshiftrw $10, %k1, %k2
-; AVX512DQ-NEXT:    kmovw %k2, %ecx
-; AVX512DQ-NEXT:    kshiftrw $9, %k1, %k2
-; AVX512DQ-NEXT:    kmovw %k2, %edx
-; AVX512DQ-NEXT:    kshiftrw $8, %k1, %k2
-; AVX512DQ-NEXT:    kmovw %k2, %edi
-; AVX512DQ-NEXT:    kshiftrw $7, %k1, %k2
-; AVX512DQ-NEXT:    kmovw %k2, %r8d
-; AVX512DQ-NEXT:    kshiftrw $6, %k1, %k2
-; AVX512DQ-NEXT:    kmovw %k2, %r9d
-; AVX512DQ-NEXT:    kshiftrw $5, %k1, %k2
-; AVX512DQ-NEXT:    kmovw %k2, %r10d
-; AVX512DQ-NEXT:    kshiftrw $4, %k1, %k2
-; AVX512DQ-NEXT:    kmovw %k2, %r11d
-; AVX512DQ-NEXT:    kshiftrw $3, %k1, %k2
-; AVX512DQ-NEXT:    kmovw %k2, %ebx
-; AVX512DQ-NEXT:    kshiftrw $6, %k0, %k2
-; AVX512DQ-NEXT:    kmovw %k2, %r14d
-; AVX512DQ-NEXT:    kshiftrw $7, %k0, %k2
-; AVX512DQ-NEXT:    kmovw %k2, %r15d
-; AVX512DQ-NEXT:    kshiftrw $8, %k0, %k2
-; AVX512DQ-NEXT:    kmovw %k2, %r12d
-; AVX512DQ-NEXT:    kshiftrw $9, %k0, %k2
-; AVX512DQ-NEXT:    kmovw %k2, %r13d
-; AVX512DQ-NEXT:    kshiftrw $10, %k0, %k2
-; AVX512DQ-NEXT:    andl $1, %r14d
-; AVX512DQ-NEXT:    movl %eax, %ebp
-; AVX512DQ-NEXT:    andl $1, %ebp
-; AVX512DQ-NEXT:    leal (%rbp,%rbp,2), %ebp
-; AVX512DQ-NEXT:    leal (%rbp,%r14,4), %ebp
-; AVX512DQ-NEXT:    leal (%rbp,%r14,8), %ebp
-; AVX512DQ-NEXT:    shll $4, %r14d
-; AVX512DQ-NEXT:    orl %ebp, %r14d
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    movl %r15d, %ebp
-; AVX512DQ-NEXT:    shll $5, %ebp
-; AVX512DQ-NEXT:    orl %r14d, %ebp
-; AVX512DQ-NEXT:    movl %r15d, %r14d
-; AVX512DQ-NEXT:    shll $6, %r14d
-; AVX512DQ-NEXT:    shll $7, %r15d
-; AVX512DQ-NEXT:    orl %r14d, %r15d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movl %r12d, %r14d
-; AVX512DQ-NEXT:    shll $8, %r14d
-; AVX512DQ-NEXT:    orl %r15d, %r14d
-; AVX512DQ-NEXT:    movl %r12d, %r15d
-; AVX512DQ-NEXT:    shll $9, %r15d
-; AVX512DQ-NEXT:    orl %r14d, %r15d
-; AVX512DQ-NEXT:    kmovw %k2, %eax
-; AVX512DQ-NEXT:    kshiftrw $11, %k0, %k2
-; AVX512DQ-NEXT:    shll $10, %r12d
-; AVX512DQ-NEXT:    orl %r15d, %r12d
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movl %r13d, %r14d
-; AVX512DQ-NEXT:    shll $11, %r14d
-; AVX512DQ-NEXT:    orl %r12d, %r14d
-; AVX512DQ-NEXT:    movl %r13d, %r15d
-; AVX512DQ-NEXT:    shll $12, %r15d
-; AVX512DQ-NEXT:    orl %r14d, %r15d
-; AVX512DQ-NEXT:    kmovw %k2, %r12d
-; AVX512DQ-NEXT:    kshiftrw $12, %k0, %k2
-; AVX512DQ-NEXT:    shll $13, %r13d
-; AVX512DQ-NEXT:    orl %r15d, %r13d
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movl %eax, %r14d
-; AVX512DQ-NEXT:    shll $14, %r14d
-; AVX512DQ-NEXT:    orl %r13d, %r14d
-; AVX512DQ-NEXT:    movl %eax, %r15d
-; AVX512DQ-NEXT:    shll $15, %r15d
-; AVX512DQ-NEXT:    orl %r14d, %r15d
-; AVX512DQ-NEXT:    kmovw %k2, %r14d
-; AVX512DQ-NEXT:    kshiftrw $13, %k0, %k2
-; AVX512DQ-NEXT:    shll $16, %eax
-; AVX512DQ-NEXT:    orl %r15d, %eax
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movl %r12d, %r15d
-; AVX512DQ-NEXT:    shll $17, %r15d
-; AVX512DQ-NEXT:    orl %eax, %r15d
-; AVX512DQ-NEXT:    movl %r12d, %eax
-; AVX512DQ-NEXT:    shll $18, %eax
-; AVX512DQ-NEXT:    orl %r15d, %eax
-; AVX512DQ-NEXT:    kmovw %k2, %r15d
-; AVX512DQ-NEXT:    kshiftrw $14, %k0, %k2
-; AVX512DQ-NEXT:    shll $19, %r12d
-; AVX512DQ-NEXT:    orl %eax, %r12d
-; AVX512DQ-NEXT:    andl $1, %r14d
-; AVX512DQ-NEXT:    movl %r14d, %eax
-; AVX512DQ-NEXT:    shll $20, %eax
-; AVX512DQ-NEXT:    orl %r12d, %eax
-; AVX512DQ-NEXT:    movl %r14d, %r12d
-; AVX512DQ-NEXT:    shll $21, %r12d
-; AVX512DQ-NEXT:    orl %eax, %r12d
-; AVX512DQ-NEXT:    kmovw %k2, %eax
-; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k2
-; AVX512DQ-NEXT:    shll $22, %r14d
-; AVX512DQ-NEXT:    orl %r12d, %r14d
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    movl %r15d, %r12d
-; AVX512DQ-NEXT:    shll $23, %r12d
-; AVX512DQ-NEXT:    orl %r14d, %r12d
-; AVX512DQ-NEXT:    movl %r15d, %r13d
-; AVX512DQ-NEXT:    shll $24, %r13d
-; AVX512DQ-NEXT:    orl %r12d, %r13d
-; AVX512DQ-NEXT:    kmovw %k2, %r14d
-; AVX512DQ-NEXT:    kshiftrw $2, %k1, %k2
-; AVX512DQ-NEXT:    shll $25, %r15d
-; AVX512DQ-NEXT:    orl %r13d, %r15d
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movl %eax, %r12d
-; AVX512DQ-NEXT:    shll $26, %r12d
-; AVX512DQ-NEXT:    orl %r15d, %r12d
-; AVX512DQ-NEXT:    movl %eax, %r15d
-; AVX512DQ-NEXT:    shll $27, %r15d
-; AVX512DQ-NEXT:    orl %r12d, %r15d
-; AVX512DQ-NEXT:    shll $28, %eax
-; AVX512DQ-NEXT:    orl %r15d, %eax
-; AVX512DQ-NEXT:    movl %r14d, %r13d
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movl %r13d, %r12d
-; AVX512DQ-NEXT:    shll $29, %r12d
-; AVX512DQ-NEXT:    orl %eax, %r12d
-; AVX512DQ-NEXT:    kmovw %k2, %r15d
-; AVX512DQ-NEXT:    kshiftrw $1, %k1, %k2
-; AVX512DQ-NEXT:    shll $30, %r13d
-; AVX512DQ-NEXT:    orl %r12d, %r13d
-; AVX512DQ-NEXT:    kmovw %k2, %r12d
-; AVX512DQ-NEXT:    kshiftrw $11, %k1, %k2
-; AVX512DQ-NEXT:    shll $31, %r14d
-; AVX512DQ-NEXT:    orl %r13d, %r14d
-; AVX512DQ-NEXT:    kmovw %k2, %r13d
-; AVX512DQ-NEXT:    kshiftrw $12, %k1, %k2
-; AVX512DQ-NEXT:    orl %ebp, %r14d
-; AVX512DQ-NEXT:    movl %r14d, 8(%rsi)
-; AVX512DQ-NEXT:    kmovw %k1, %eax
-; AVX512DQ-NEXT:    movzbl %al, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    leaq (%rax,%rax,2), %r14
-; AVX512DQ-NEXT:    leaq (%r14,%rax,4), %rax
-; AVX512DQ-NEXT:    movzbl %r12b, %r14d
-; AVX512DQ-NEXT:    andl $1, %r14d
-; AVX512DQ-NEXT:    leaq (%rax,%r14,8), %rax
-; AVX512DQ-NEXT:    movq %r14, %r12
-; AVX512DQ-NEXT:    shlq $4, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    shlq $5, %r14
-; AVX512DQ-NEXT:    orq %r12, %r14
-; AVX512DQ-NEXT:    movzbl %r15b, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %r15
-; AVX512DQ-NEXT:    shlq $6, %r15
-; AVX512DQ-NEXT:    movq %rax, %r12
-; AVX512DQ-NEXT:    shlq $7, %r12
-; AVX512DQ-NEXT:    orq %r15, %r12
-; AVX512DQ-NEXT:    kmovw %k2, %r15d
-; AVX512DQ-NEXT:    kshiftrw $13, %k1, %k2
-; AVX512DQ-NEXT:    shlq $8, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movzbl %bl, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rbx
-; AVX512DQ-NEXT:    shlq $9, %rbx
-; AVX512DQ-NEXT:    orq %rax, %rbx
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $10, %rax
-; AVX512DQ-NEXT:    orq %rbx, %rax
-; AVX512DQ-NEXT:    kmovw %k2, %ebx
-; AVX512DQ-NEXT:    kshiftrw $14, %k1, %k2
-; AVX512DQ-NEXT:    shlq $11, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movzbl %r11b, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %r11
-; AVX512DQ-NEXT:    shlq $12, %r11
-; AVX512DQ-NEXT:    orq %r12, %r11
-; AVX512DQ-NEXT:    movq %rax, %r12
-; AVX512DQ-NEXT:    shlq $13, %r12
-; AVX512DQ-NEXT:    orq %r11, %r12
-; AVX512DQ-NEXT:    kmovw %k2, %r11d
-; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT:    shlq $14, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movzbl %r10b, %ebp
-; AVX512DQ-NEXT:    andl $1, %ebp
-; AVX512DQ-NEXT:    movq %rbp, %r10
-; AVX512DQ-NEXT:    shlq $15, %r10
-; AVX512DQ-NEXT:    orq %rax, %r10
-; AVX512DQ-NEXT:    movq %rbp, %rax
-; AVX512DQ-NEXT:    shlq $16, %rax
-; AVX512DQ-NEXT:    orq %r10, %rax
-; AVX512DQ-NEXT:    kmovw %k1, %r10d
-; AVX512DQ-NEXT:    kshiftrw $1, %k0, %k1
-; AVX512DQ-NEXT:    shlq $17, %rbp
-; AVX512DQ-NEXT:    orq %rax, %rbp
-; AVX512DQ-NEXT:    movzbl %r9b, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $18, %rax
-; AVX512DQ-NEXT:    orq %rbp, %rax
-; AVX512DQ-NEXT:    movq %r12, %rbp
-; AVX512DQ-NEXT:    shlq $19, %rbp
-; AVX512DQ-NEXT:    orq %rax, %rbp
-; AVX512DQ-NEXT:    kmovw %k1, %r9d
-; AVX512DQ-NEXT:    kshiftrw $2, %k0, %k1
-; AVX512DQ-NEXT:    shlq $20, %r12
-; AVX512DQ-NEXT:    orq %rbp, %r12
-; AVX512DQ-NEXT:    movzbl %r8b, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %r8
-; AVX512DQ-NEXT:    shlq $21, %r8
-; AVX512DQ-NEXT:    orq %r12, %r8
-; AVX512DQ-NEXT:    movq %rax, %r12
-; AVX512DQ-NEXT:    shlq $22, %r12
-; AVX512DQ-NEXT:    orq %r8, %r12
-; AVX512DQ-NEXT:    kmovw %k1, %r8d
-; AVX512DQ-NEXT:    kshiftrw $3, %k0, %k1
-; AVX512DQ-NEXT:    shlq $23, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movzbl %dil, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rdi
-; AVX512DQ-NEXT:    shlq $24, %rdi
-; AVX512DQ-NEXT:    orq %rax, %rdi
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $25, %rax
-; AVX512DQ-NEXT:    orq %rdi, %rax
-; AVX512DQ-NEXT:    kmovw %k1, %edi
-; AVX512DQ-NEXT:    shlq $26, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movzbl %dl, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %rdx
-; AVX512DQ-NEXT:    shlq $27, %rdx
-; AVX512DQ-NEXT:    orq %r12, %rdx
-; AVX512DQ-NEXT:    movq %rax, %r12
-; AVX512DQ-NEXT:    shlq $28, %r12
-; AVX512DQ-NEXT:    orq %rdx, %r12
-; AVX512DQ-NEXT:    shlq $29, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movzbl %cl, %ebp
-; AVX512DQ-NEXT:    andl $1, %ebp
-; AVX512DQ-NEXT:    movq %rbp, %rdx
-; AVX512DQ-NEXT:    shlq $30, %rdx
-; AVX512DQ-NEXT:    orq %rax, %rdx
-; AVX512DQ-NEXT:    kmovw %k0, %r12d
-; AVX512DQ-NEXT:    kshiftrw $4, %k0, %k0
-; AVX512DQ-NEXT:    shlq $31, %rbp
-; AVX512DQ-NEXT:    orq %rdx, %rbp
-; AVX512DQ-NEXT:    kmovw %k0, %edx
-; AVX512DQ-NEXT:    andl $1, %ecx
-; AVX512DQ-NEXT:    shlq $32, %rcx
-; AVX512DQ-NEXT:    orq %rbp, %rcx
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $33, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    movq %r13, %rcx
-; AVX512DQ-NEXT:    shlq $34, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    shlq $35, %r13
-; AVX512DQ-NEXT:    orq %rcx, %r13
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $36, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r15, %rcx
-; AVX512DQ-NEXT:    shlq $37, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    shlq $38, %r15
-; AVX512DQ-NEXT:    orq %rcx, %r15
-; AVX512DQ-NEXT:    andl $1, %ebx
-; AVX512DQ-NEXT:    movq %rbx, %rax
-; AVX512DQ-NEXT:    shlq $39, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movq %rbx, %rcx
-; AVX512DQ-NEXT:    shlq $40, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    shlq $41, %rbx
-; AVX512DQ-NEXT:    orq %rcx, %rbx
-; AVX512DQ-NEXT:    andl $1, %r11d
-; AVX512DQ-NEXT:    movq %r11, %rax
-; AVX512DQ-NEXT:    shlq $42, %rax
-; AVX512DQ-NEXT:    orq %rbx, %rax
-; AVX512DQ-NEXT:    movq %r11, %rcx
-; AVX512DQ-NEXT:    shlq $43, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    shlq $44, %r11
-; AVX512DQ-NEXT:    orq %rcx, %r11
-; AVX512DQ-NEXT:    andl $1, %r10d
-; AVX512DQ-NEXT:    movq %r10, %rax
-; AVX512DQ-NEXT:    shlq $45, %rax
-; AVX512DQ-NEXT:    orq %r11, %rax
-; AVX512DQ-NEXT:    movq %r10, %rcx
-; AVX512DQ-NEXT:    shlq $46, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    shlq $47, %r10
-; AVX512DQ-NEXT:    orq %rcx, %r10
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $48, %rax
-; AVX512DQ-NEXT:    orq %r10, %rax
-; AVX512DQ-NEXT:    movq %r12, %rcx
-; AVX512DQ-NEXT:    shlq $49, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    shlq $50, %r12
-; AVX512DQ-NEXT:    orq %rcx, %r12
-; AVX512DQ-NEXT:    andl $1, %r9d
-; AVX512DQ-NEXT:    movq %r9, %rax
-; AVX512DQ-NEXT:    shlq $51, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r9, %rcx
-; AVX512DQ-NEXT:    shlq $52, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    shlq $53, %r9
-; AVX512DQ-NEXT:    orq %rcx, %r9
-; AVX512DQ-NEXT:    andl $1, %r8d
-; AVX512DQ-NEXT:    movq %r8, %rax
-; AVX512DQ-NEXT:    shlq $54, %rax
-; AVX512DQ-NEXT:    orq %r9, %rax
-; AVX512DQ-NEXT:    movq %r8, %rcx
-; AVX512DQ-NEXT:    shlq $55, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    shlq $56, %r8
-; AVX512DQ-NEXT:    orq %rcx, %r8
-; AVX512DQ-NEXT:    andl $1, %edi
-; AVX512DQ-NEXT:    movq %rdi, %rax
-; AVX512DQ-NEXT:    shlq $57, %rax
-; AVX512DQ-NEXT:    orq %r8, %rax
-; AVX512DQ-NEXT:    movq %rdi, %rcx
-; AVX512DQ-NEXT:    shlq $58, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    shlq $59, %rdi
-; AVX512DQ-NEXT:    orq %rcx, %rdi
-; AVX512DQ-NEXT:    andl $1, %edx
-; AVX512DQ-NEXT:    movq %rdx, %rax
-; AVX512DQ-NEXT:    shlq $60, %rax
-; AVX512DQ-NEXT:    orq %rdi, %rax
-; AVX512DQ-NEXT:    movq %rdx, %rcx
-; AVX512DQ-NEXT:    shlq $61, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    shlq $62, %rdx
-; AVX512DQ-NEXT:    orq %rcx, %rdx
-; AVX512DQ-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX512DQ-NEXT:    shlq $63, %rax
-; AVX512DQ-NEXT:    orq %rdx, %rax
-; AVX512DQ-NEXT:    orq %r14, %rax
-; AVX512DQ-NEXT:    movq %rax, (%rsi)
-; AVX512DQ-NEXT:    popq %rbx
-; AVX512DQ-NEXT:    popq %r12
-; AVX512DQ-NEXT:    popq %r13
-; AVX512DQ-NEXT:    popq %r14
-; AVX512DQ-NEXT:    popq %r15
-; AVX512DQ-NEXT:    popq %rbp
+; AVX512DQ-NEXT:    vpmovm2d %k1, %zmm0
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm2
+; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k1
+; AVX512DQ-NEXT:    vpmovm2d %k1, %zmm2
+; AVX512DQ-NEXT:    movw $1, %ax
+; AVX512DQ-NEXT:    kmovw %eax, %k1
+; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1}
+; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k2
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm2, %zmm3
+; AVX512DQ-NEXT:    vpmovd2m %zmm3, %k1
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm3, %zmm0
+; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k3
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k4
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm2, %zmm1
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k5
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm3, %zmm0
+; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k6
+; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k2} {z}
+; AVX512DQ-NEXT:    vmovdqa32 320(%rsi), %zmm1 {%k6} {z}
+; AVX512DQ-NEXT:    vmovdqa32 256(%rsi), %zmm2 {%k5} {z}
+; AVX512DQ-NEXT:    vmovdqa32 192(%rsi), %zmm3 {%k4} {z}
+; AVX512DQ-NEXT:    vmovdqa32 128(%rsi), %zmm4 {%k3} {z}
+; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm5 {%k1} {z}
+; AVX512DQ-NEXT:    vmovdqa64 %zmm5, 64(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 128(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 192(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 256(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 320(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm0, (%rdx)
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512BW-LABEL: mask_replication_factor3_vf32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    pushq %rbp
-; AVX512BW-NEXT:    pushq %r15
-; AVX512BW-NEXT:    pushq %r14
-; AVX512BW-NEXT:    pushq %r13
-; AVX512BW-NEXT:    pushq %r12
-; AVX512BW-NEXT:    pushq %rbx
-; AVX512BW-NEXT:    movq %rsi, %rdx
 ; AVX512BW-NEXT:    kmovd (%rdi), %k0
-; AVX512BW-NEXT:    kshiftrd $21, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %esi
-; AVX512BW-NEXT:    kshiftrd $10, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %ecx
-; AVX512BW-NEXT:    kshiftrd $9, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512BW-NEXT:    kshiftrd $8, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %edi
-; AVX512BW-NEXT:    kshiftrd $7, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r8d
-; AVX512BW-NEXT:    kshiftrd $6, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r9d
-; AVX512BW-NEXT:    kshiftrd $5, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r10d
-; AVX512BW-NEXT:    kshiftrd $4, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r11d
-; AVX512BW-NEXT:    kshiftrd $3, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %ebx
-; AVX512BW-NEXT:    kshiftrd $22, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r14d
-; AVX512BW-NEXT:    kshiftrd $23, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r15d
-; AVX512BW-NEXT:    kshiftrd $24, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrd $25, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r13d
-; AVX512BW-NEXT:    kshiftrd $26, %k0, %k1
-; AVX512BW-NEXT:    andl $1, %r14d
-; AVX512BW-NEXT:    movl %esi, %ebp
-; AVX512BW-NEXT:    andl $1, %ebp
-; AVX512BW-NEXT:    leal (%rbp,%rbp,2), %ebp
-; AVX512BW-NEXT:    leal (%rbp,%r14,4), %ebp
-; AVX512BW-NEXT:    leal (%rbp,%r14,8), %ebp
-; AVX512BW-NEXT:    shll $4, %r14d
-; AVX512BW-NEXT:    orl %ebp, %r14d
-; AVX512BW-NEXT:    andl $1, %r15d
-; AVX512BW-NEXT:    movl %r15d, %ebp
-; AVX512BW-NEXT:    shll $5, %ebp
-; AVX512BW-NEXT:    orl %r14d, %ebp
-; AVX512BW-NEXT:    movl %r15d, %r14d
-; AVX512BW-NEXT:    shll $6, %r14d
-; AVX512BW-NEXT:    shll $7, %r15d
-; AVX512BW-NEXT:    orl %r14d, %r15d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movl %r12d, %r14d
-; AVX512BW-NEXT:    shll $8, %r14d
-; AVX512BW-NEXT:    orl %r15d, %r14d
-; AVX512BW-NEXT:    movl %r12d, %r15d
-; AVX512BW-NEXT:    shll $9, %r15d
-; AVX512BW-NEXT:    orl %r14d, %r15d
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    kshiftrd $27, %k0, %k1
-; AVX512BW-NEXT:    shll $10, %r12d
-; AVX512BW-NEXT:    orl %r15d, %r12d
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movl %r13d, %r14d
-; AVX512BW-NEXT:    shll $11, %r14d
-; AVX512BW-NEXT:    orl %r12d, %r14d
-; AVX512BW-NEXT:    movl %r13d, %r15d
-; AVX512BW-NEXT:    shll $12, %r15d
-; AVX512BW-NEXT:    orl %r14d, %r15d
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrd $28, %k0, %k1
-; AVX512BW-NEXT:    shll $13, %r13d
-; AVX512BW-NEXT:    orl %r15d, %r13d
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movl %eax, %r14d
-; AVX512BW-NEXT:    shll $14, %r14d
-; AVX512BW-NEXT:    orl %r13d, %r14d
-; AVX512BW-NEXT:    movl %eax, %r15d
-; AVX512BW-NEXT:    shll $15, %r15d
-; AVX512BW-NEXT:    orl %r14d, %r15d
-; AVX512BW-NEXT:    kmovd %k1, %r14d
-; AVX512BW-NEXT:    kshiftrd $29, %k0, %k1
-; AVX512BW-NEXT:    shll $16, %eax
-; AVX512BW-NEXT:    orl %r15d, %eax
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movl %r12d, %r15d
-; AVX512BW-NEXT:    shll $17, %r15d
-; AVX512BW-NEXT:    orl %eax, %r15d
-; AVX512BW-NEXT:    movl %r12d, %eax
-; AVX512BW-NEXT:    shll $18, %eax
-; AVX512BW-NEXT:    orl %r15d, %eax
-; AVX512BW-NEXT:    kmovd %k1, %r15d
-; AVX512BW-NEXT:    kshiftrd $30, %k0, %k1
-; AVX512BW-NEXT:    shll $19, %r12d
-; AVX512BW-NEXT:    orl %eax, %r12d
-; AVX512BW-NEXT:    andl $1, %r14d
-; AVX512BW-NEXT:    movl %r14d, %eax
-; AVX512BW-NEXT:    shll $20, %eax
-; AVX512BW-NEXT:    orl %r12d, %eax
-; AVX512BW-NEXT:    movl %r14d, %r12d
-; AVX512BW-NEXT:    shll $21, %r12d
-; AVX512BW-NEXT:    orl %eax, %r12d
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    kshiftrd $31, %k0, %k1
-; AVX512BW-NEXT:    shll $22, %r14d
-; AVX512BW-NEXT:    orl %r12d, %r14d
-; AVX512BW-NEXT:    andl $1, %r15d
-; AVX512BW-NEXT:    movl %r15d, %r12d
-; AVX512BW-NEXT:    shll $23, %r12d
-; AVX512BW-NEXT:    orl %r14d, %r12d
-; AVX512BW-NEXT:    movl %r15d, %r13d
-; AVX512BW-NEXT:    shll $24, %r13d
-; AVX512BW-NEXT:    orl %r12d, %r13d
-; AVX512BW-NEXT:    kmovd %k1, %r14d
-; AVX512BW-NEXT:    kshiftrd $2, %k0, %k1
-; AVX512BW-NEXT:    shll $25, %r15d
-; AVX512BW-NEXT:    orl %r13d, %r15d
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movl %eax, %r12d
-; AVX512BW-NEXT:    shll $26, %r12d
-; AVX512BW-NEXT:    orl %r15d, %r12d
-; AVX512BW-NEXT:    movl %eax, %r15d
-; AVX512BW-NEXT:    shll $27, %r15d
-; AVX512BW-NEXT:    orl %r12d, %r15d
-; AVX512BW-NEXT:    shll $28, %eax
-; AVX512BW-NEXT:    orl %r15d, %eax
-; AVX512BW-NEXT:    movl %r14d, %r13d
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movl %r13d, %r12d
-; AVX512BW-NEXT:    shll $29, %r12d
-; AVX512BW-NEXT:    orl %eax, %r12d
-; AVX512BW-NEXT:    kmovd %k1, %r15d
 ; AVX512BW-NEXT:    kshiftrd $1, %k0, %k1
-; AVX512BW-NEXT:    shll $30, %r13d
-; AVX512BW-NEXT:    orl %r12d, %r13d
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrd $11, %k0, %k1
-; AVX512BW-NEXT:    shll $31, %r14d
-; AVX512BW-NEXT:    orl %r13d, %r14d
-; AVX512BW-NEXT:    kmovd %k1, %r13d
-; AVX512BW-NEXT:    kshiftrd $12, %k0, %k1
-; AVX512BW-NEXT:    orl %ebp, %r14d
-; AVX512BW-NEXT:    movq %rdx, %rbp
-; AVX512BW-NEXT:    movl %r14d, 8(%rdx)
-; AVX512BW-NEXT:    kmovd %k0, %eax
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    leaq (%rax,%rax,2), %r14
-; AVX512BW-NEXT:    leaq (%r14,%rax,4), %rax
-; AVX512BW-NEXT:    movzbl %r12b, %r14d
-; AVX512BW-NEXT:    andl $1, %r14d
-; AVX512BW-NEXT:    leaq (%rax,%r14,8), %rax
-; AVX512BW-NEXT:    movq %r14, %r12
-; AVX512BW-NEXT:    shlq $4, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    shlq $5, %r14
-; AVX512BW-NEXT:    orq %r12, %r14
-; AVX512BW-NEXT:    movzbl %r15b, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %r15
-; AVX512BW-NEXT:    shlq $6, %r15
-; AVX512BW-NEXT:    movq %rax, %r12
-; AVX512BW-NEXT:    shlq $7, %r12
-; AVX512BW-NEXT:    orq %r15, %r12
-; AVX512BW-NEXT:    kmovd %k1, %r15d
-; AVX512BW-NEXT:    kshiftrd $13, %k0, %k1
-; AVX512BW-NEXT:    shlq $8, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movzbl %bl, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rbx
-; AVX512BW-NEXT:    shlq $9, %rbx
-; AVX512BW-NEXT:    orq %rax, %rbx
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $10, %rax
-; AVX512BW-NEXT:    orq %rbx, %rax
-; AVX512BW-NEXT:    kmovd %k1, %ebx
-; AVX512BW-NEXT:    kshiftrd $14, %k0, %k1
-; AVX512BW-NEXT:    shlq $11, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movzbl %r11b, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %r11
-; AVX512BW-NEXT:    shlq $12, %r11
-; AVX512BW-NEXT:    orq %r12, %r11
-; AVX512BW-NEXT:    movq %rax, %r12
-; AVX512BW-NEXT:    shlq $13, %r12
-; AVX512BW-NEXT:    orq %r11, %r12
-; AVX512BW-NEXT:    kmovd %k1, %r11d
-; AVX512BW-NEXT:    kshiftrd $15, %k0, %k1
-; AVX512BW-NEXT:    shlq $14, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movzbl %r10b, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %r10
-; AVX512BW-NEXT:    shlq $15, %r10
-; AVX512BW-NEXT:    orq %rax, %r10
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $16, %rax
-; AVX512BW-NEXT:    orq %r10, %rax
-; AVX512BW-NEXT:    kmovd %k1, %r10d
+; AVX512BW-NEXT:    movw $-3, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k3
+; AVX512BW-NEXT:    kandw %k3, %k0, %k2
+; AVX512BW-NEXT:    kmovq %k3, %k7
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k3
+; AVX512BW-NEXT:    kshiftrw $14, %k3, %k4
+; AVX512BW-NEXT:    korw %k4, %k2, %k2
+; AVX512BW-NEXT:    movw $-5, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k4
+; AVX512BW-NEXT:    kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k4, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $13, %k3, %k3
+; AVX512BW-NEXT:    korw %k3, %k2, %k2
+; AVX512BW-NEXT:    movw $-9, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k3
+; AVX512BW-NEXT:    kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k3, %k2, %k2
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $12, %k1, %k3
+; AVX512BW-NEXT:    korw %k3, %k2, %k2
+; AVX512BW-NEXT:    movw $-17, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k5
+; AVX512BW-NEXT:    kandw %k5, %k2, %k2
+; AVX512BW-NEXT:    kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kshiftrw $11, %k1, %k3
+; AVX512BW-NEXT:    korw %k3, %k2, %k2
+; AVX512BW-NEXT:    movw $-33, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k3
+; AVX512BW-NEXT:    kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k3, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $10, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k2, %k1
+; AVX512BW-NEXT:    movw $-65, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k2
+; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrd $2, %k0, %k2
+; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $9, %k2, %k3
+; AVX512BW-NEXT:    korw %k3, %k1, %k1
+; AVX512BW-NEXT:    movw $-129, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k3
+; AVX512BW-NEXT:    kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $8, %k2, %k3
+; AVX512BW-NEXT:    korw %k3, %k1, %k1
+; AVX512BW-NEXT:    movw $-257, %ax # imm = 0xFEFF
+; AVX512BW-NEXT:    kmovd %eax, %k3
+; AVX512BW-NEXT:    kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $7, %k2, %k2
+; AVX512BW-NEXT:    korw %k2, %k1, %k1
+; AVX512BW-NEXT:    movw $-513, %ax # imm = 0xFDFF
+; AVX512BW-NEXT:    kmovd %eax, %k2
+; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrd $3, %k0, %k2
+; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $6, %k2, %k3
+; AVX512BW-NEXT:    korw %k3, %k1, %k1
+; AVX512BW-NEXT:    movw $-1025, %ax # imm = 0xFBFF
+; AVX512BW-NEXT:    kmovd %eax, %k3
+; AVX512BW-NEXT:    kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $5, %k2, %k3
+; AVX512BW-NEXT:    korw %k3, %k1, %k1
+; AVX512BW-NEXT:    movw $-2049, %ax # imm = 0xF7FF
+; AVX512BW-NEXT:    kmovd %eax, %k3
+; AVX512BW-NEXT:    kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $4, %k2, %k2
+; AVX512BW-NEXT:    korw %k2, %k1, %k1
+; AVX512BW-NEXT:    movw $-4097, %ax # imm = 0xEFFF
+; AVX512BW-NEXT:    kmovd %eax, %k2
+; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrd $4, %k0, %k4
+; AVX512BW-NEXT:    kshiftlw $15, %k4, %k2
+; AVX512BW-NEXT:    kshiftrw $3, %k2, %k3
+; AVX512BW-NEXT:    korw %k3, %k1, %k1
+; AVX512BW-NEXT:    movw $-8193, %ax # imm = 0xDFFF
+; AVX512BW-NEXT:    kmovd %eax, %k6
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kshiftrw $2, %k2, %k2
+; AVX512BW-NEXT:    korw %k2, %k1, %k1
+; AVX512BW-NEXT:    movw $-16385, %ax # imm = 0xBFFF
+; AVX512BW-NEXT:    kmovd %eax, %k2
+; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $14, %k4, %k4
+; AVX512BW-NEXT:    korw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
+; AVX512BW-NEXT:    kshiftrd $5, %k0, %k2
+; AVX512BW-NEXT:    kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
+; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    korw %k2, %k1, %k1
+; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    kshiftrd $27, %k0, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k4
+; AVX512BW-NEXT:    kshiftrd $26, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512BW-NEXT:    kmovq %k7, %k2
+; AVX512BW-NEXT:    kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k7, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $14, %k4, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $13, %k4, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $12, %k4, %k4
+; AVX512BW-NEXT:    korw %k4, %k1, %k1
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrd $28, %k0, %k4
+; AVX512BW-NEXT:    kshiftlw $15, %k4, %k4
+; AVX512BW-NEXT:    kshiftrw $11, %k4, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $10, %k4, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $9, %k4, %k4
+; AVX512BW-NEXT:    korw %k4, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrd $29, %k0, %k4
+; AVX512BW-NEXT:    kshiftlw $15, %k4, %k4
+; AVX512BW-NEXT:    kshiftrw $8, %k4, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $7, %k4, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $6, %k4, %k4
+; AVX512BW-NEXT:    korw %k4, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrd $30, %k0, %k4
+; AVX512BW-NEXT:    kshiftlw $15, %k4, %k4
+; AVX512BW-NEXT:    kshiftrw $5, %k4, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $4, %k4, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $3, %k4, %k4
+; AVX512BW-NEXT:    korw %k4, %k1, %k1
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrd $31, %k0, %k4
+; AVX512BW-NEXT:    kshiftlw $15, %k4, %k7
+; AVX512BW-NEXT:    kshiftrw $2, %k7, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $14, %k4, %k4
+; AVX512BW-NEXT:    korw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    vmovdqa32 320(%rsi), %zmm1 {%k1} {z}
+; AVX512BW-NEXT:    kshiftrd $21, %k0, %k1
+; AVX512BW-NEXT:    kandw %k2, %k1, %k6
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kshiftrw $14, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k6, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrd $22, %k0, %k6
+; AVX512BW-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $13, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $12, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $11, %k6, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrd $23, %k0, %k6
+; AVX512BW-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $10, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $9, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $8, %k6, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrd $24, %k0, %k6
+; AVX512BW-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $7, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $6, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $5, %k6, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrd $25, %k0, %k6
+; AVX512BW-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $4, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $2, %k6, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
+; AVX512BW-NEXT:    kshiftlw $14, %k2, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
+; AVX512BW-NEXT:    korw %k2, %k1, %k1
+; AVX512BW-NEXT:    vmovdqa32 256(%rsi), %zmm2 {%k1} {z}
 ; AVX512BW-NEXT:    kshiftrd $16, %k0, %k1
-; AVX512BW-NEXT:    shlq $17, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movzbl %r9b, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %r9
-; AVX512BW-NEXT:    shlq $18, %r9
-; AVX512BW-NEXT:    orq %r12, %r9
-; AVX512BW-NEXT:    movq %rax, %r12
-; AVX512BW-NEXT:    shlq $19, %r12
-; AVX512BW-NEXT:    orq %r9, %r12
-; AVX512BW-NEXT:    kmovd %k1, %r9d
-; AVX512BW-NEXT:    kshiftrd $17, %k0, %k1
-; AVX512BW-NEXT:    shlq $20, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movzbl %r8b, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %r8
-; AVX512BW-NEXT:    shlq $21, %r8
-; AVX512BW-NEXT:    orq %rax, %r8
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $22, %rax
-; AVX512BW-NEXT:    orq %r8, %rax
-; AVX512BW-NEXT:    kmovd %k1, %r8d
-; AVX512BW-NEXT:    kshiftrd $18, %k0, %k1
-; AVX512BW-NEXT:    shlq $23, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movzbl %dil, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %rdi
-; AVX512BW-NEXT:    shlq $24, %rdi
-; AVX512BW-NEXT:    orq %r12, %rdi
-; AVX512BW-NEXT:    movq %rax, %r12
-; AVX512BW-NEXT:    shlq $25, %r12
-; AVX512BW-NEXT:    orq %rdi, %r12
-; AVX512BW-NEXT:    kmovd %k1, %edi
-; AVX512BW-NEXT:    kshiftrd $19, %k0, %k1
-; AVX512BW-NEXT:    shlq $26, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
-; AVX512BW-NEXT:    andl $1, %edx
-; AVX512BW-NEXT:    movq %rdx, %r12
-; AVX512BW-NEXT:    shlq $27, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %rdx, %rax
-; AVX512BW-NEXT:    shlq $28, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    shlq $29, %rdx
-; AVX512BW-NEXT:    orq %rax, %rdx
-; AVX512BW-NEXT:    movzbl %cl, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %r12
-; AVX512BW-NEXT:    shlq $30, %r12
-; AVX512BW-NEXT:    orq %rdx, %r12
-; AVX512BW-NEXT:    kmovd %k1, %edx
-; AVX512BW-NEXT:    kshiftrd $20, %k0, %k0
-; AVX512BW-NEXT:    shlq $31, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    kmovd %k0, %r12d
-; AVX512BW-NEXT:    andl $1, %ecx
-; AVX512BW-NEXT:    shlq $32, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $33, %rax
-; AVX512BW-NEXT:    orq %rcx, %rax
-; AVX512BW-NEXT:    movq %r13, %rcx
-; AVX512BW-NEXT:    shlq $34, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    shlq $35, %r13
-; AVX512BW-NEXT:    orq %rcx, %r13
-; AVX512BW-NEXT:    andl $1, %r15d
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $36, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r15, %rcx
-; AVX512BW-NEXT:    shlq $37, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    shlq $38, %r15
-; AVX512BW-NEXT:    orq %rcx, %r15
-; AVX512BW-NEXT:    andl $1, %ebx
-; AVX512BW-NEXT:    movq %rbx, %rax
-; AVX512BW-NEXT:    shlq $39, %rax
-; AVX512BW-NEXT:    orq %r15, %rax
-; AVX512BW-NEXT:    movq %rbx, %rcx
-; AVX512BW-NEXT:    shlq $40, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    shlq $41, %rbx
-; AVX512BW-NEXT:    orq %rcx, %rbx
-; AVX512BW-NEXT:    andl $1, %r11d
-; AVX512BW-NEXT:    movq %r11, %rax
-; AVX512BW-NEXT:    shlq $42, %rax
-; AVX512BW-NEXT:    orq %rbx, %rax
-; AVX512BW-NEXT:    movq %r11, %rcx
-; AVX512BW-NEXT:    shlq $43, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    shlq $44, %r11
-; AVX512BW-NEXT:    orq %rcx, %r11
-; AVX512BW-NEXT:    andl $1, %r10d
-; AVX512BW-NEXT:    movq %r10, %rax
-; AVX512BW-NEXT:    shlq $45, %rax
-; AVX512BW-NEXT:    orq %r11, %rax
-; AVX512BW-NEXT:    movq %r10, %rcx
-; AVX512BW-NEXT:    shlq $46, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    shlq $47, %r10
-; AVX512BW-NEXT:    orq %rcx, %r10
-; AVX512BW-NEXT:    andl $1, %r9d
-; AVX512BW-NEXT:    movq %r9, %rax
-; AVX512BW-NEXT:    shlq $48, %rax
-; AVX512BW-NEXT:    orq %r10, %rax
-; AVX512BW-NEXT:    movq %r9, %rcx
-; AVX512BW-NEXT:    shlq $49, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    shlq $50, %r9
-; AVX512BW-NEXT:    orq %rcx, %r9
-; AVX512BW-NEXT:    andl $1, %r8d
-; AVX512BW-NEXT:    movq %r8, %rax
-; AVX512BW-NEXT:    shlq $51, %rax
-; AVX512BW-NEXT:    orq %r9, %rax
-; AVX512BW-NEXT:    movq %r8, %rcx
-; AVX512BW-NEXT:    shlq $52, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    shlq $53, %r8
-; AVX512BW-NEXT:    orq %rcx, %r8
-; AVX512BW-NEXT:    andl $1, %edi
-; AVX512BW-NEXT:    movq %rdi, %rax
-; AVX512BW-NEXT:    shlq $54, %rax
-; AVX512BW-NEXT:    orq %r8, %rax
-; AVX512BW-NEXT:    movq %rdi, %rcx
-; AVX512BW-NEXT:    shlq $55, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    shlq $56, %rdi
-; AVX512BW-NEXT:    orq %rcx, %rdi
-; AVX512BW-NEXT:    andl $1, %edx
-; AVX512BW-NEXT:    movq %rdx, %rax
-; AVX512BW-NEXT:    shlq $57, %rax
-; AVX512BW-NEXT:    orq %rdi, %rax
-; AVX512BW-NEXT:    movq %rdx, %rcx
-; AVX512BW-NEXT:    shlq $58, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    shlq $59, %rdx
-; AVX512BW-NEXT:    orq %rcx, %rdx
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $60, %rax
-; AVX512BW-NEXT:    orq %rdx, %rax
-; AVX512BW-NEXT:    movq %r12, %rcx
-; AVX512BW-NEXT:    shlq $61, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    shlq $62, %r12
-; AVX512BW-NEXT:    orq %rcx, %r12
-; AVX512BW-NEXT:    movzbl %sil, %eax
-; AVX512BW-NEXT:    shlq $63, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    orq %r14, %rax
-; AVX512BW-NEXT:    movq %rax, (%rbp)
-; AVX512BW-NEXT:    popq %rbx
-; AVX512BW-NEXT:    popq %r12
-; AVX512BW-NEXT:    popq %r13
-; AVX512BW-NEXT:    popq %r14
-; AVX512BW-NEXT:    popq %r15
-; AVX512BW-NEXT:    popq %rbp
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k2
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $14, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $13, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k2, %k1
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrd $17, %k0, %k2
+; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $12, %k2, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $11, %k2, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $10, %k2, %k2
+; AVX512BW-NEXT:    korw %k2, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrd $18, %k0, %k2
+; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $9, %k2, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $8, %k2, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $7, %k2, %k2
+; AVX512BW-NEXT:    korw %k2, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrd $19, %k0, %k2
+; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $6, %k2, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $5, %k2, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $4, %k2, %k2
+; AVX512BW-NEXT:    korw %k2, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrd $20, %k0, %k2
+; AVX512BW-NEXT:    kshiftlw $15, %k2, %k6
+; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $2, %k6, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $14, %k2, %k2
+; AVX512BW-NEXT:    korw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    korw %k2, %k1, %k1
+; AVX512BW-NEXT:    vmovdqa32 192(%rsi), %zmm3 {%k1} {z}
+; AVX512BW-NEXT:    kshiftrd $11, %k0, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k2
+; AVX512BW-NEXT:    kshiftrd $10, %k0, %k4
+; AVX512BW-NEXT:    kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k4, %k4
+; AVX512BW-NEXT:    kshiftrw $14, %k2, %k6
+; AVX512BW-NEXT:    korw %k6, %k4, %k4
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k4, %k4
+; AVX512BW-NEXT:    kshiftrw $13, %k2, %k6
+; AVX512BW-NEXT:    korw %k6, %k4, %k4
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k4, %k4
+; AVX512BW-NEXT:    kshiftrw $12, %k2, %k2
+; AVX512BW-NEXT:    korw %k2, %k4, %k2
+; AVX512BW-NEXT:    kandw %k3, %k2, %k2
+; AVX512BW-NEXT:    kshiftrd $12, %k0, %k4
+; AVX512BW-NEXT:    kshiftlw $15, %k4, %k4
+; AVX512BW-NEXT:    kshiftrw $11, %k4, %k6
+; AVX512BW-NEXT:    korw %k6, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $10, %k4, %k6
+; AVX512BW-NEXT:    korw %k6, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $9, %k4, %k4
+; AVX512BW-NEXT:    korw %k4, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k2, %k2
+; AVX512BW-NEXT:    kshiftrd $13, %k0, %k4
+; AVX512BW-NEXT:    kshiftlw $15, %k4, %k4
+; AVX512BW-NEXT:    kshiftrw $8, %k4, %k6
+; AVX512BW-NEXT:    korw %k6, %k2, %k2
+; AVX512BW-NEXT:    kandw %k5, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $7, %k4, %k6
+; AVX512BW-NEXT:    korw %k6, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $6, %k4, %k4
+; AVX512BW-NEXT:    korw %k4, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k2, %k2
+; AVX512BW-NEXT:    kshiftrd $14, %k0, %k4
+; AVX512BW-NEXT:    kshiftlw $15, %k4, %k4
+; AVX512BW-NEXT:    kshiftrw $5, %k4, %k6
+; AVX512BW-NEXT:    korw %k6, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $4, %k4, %k6
+; AVX512BW-NEXT:    korw %k6, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $3, %k4, %k4
+; AVX512BW-NEXT:    korw %k4, %k2, %k2
+; AVX512BW-NEXT:    kandw %k7, %k2, %k2
+; AVX512BW-NEXT:    kshiftrd $15, %k0, %k4
+; AVX512BW-NEXT:    kshiftlw $15, %k4, %k6
+; AVX512BW-NEXT:    kshiftrw $2, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k2, %k2
+; AVX512BW-NEXT:    kshiftlw $14, %k4, %k4
+; AVX512BW-NEXT:    korw %k4, %k2, %k2
+; AVX512BW-NEXT:    kshiftlw $1, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $1, %k2, %k2
+; AVX512BW-NEXT:    korw %k6, %k2, %k2
+; AVX512BW-NEXT:    vmovdqa32 128(%rsi), %zmm4 {%k2} {z}
+; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kshiftrw $14, %k4, %k4
+; AVX512BW-NEXT:    korw %k4, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k2, %k2
+; AVX512BW-NEXT:    kshiftrd $6, %k0, %k4
+; AVX512BW-NEXT:    kshiftlw $15, %k4, %k4
+; AVX512BW-NEXT:    kshiftrw $13, %k4, %k6
+; AVX512BW-NEXT:    korw %k6, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $12, %k4, %k6
+; AVX512BW-NEXT:    korw %k6, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $11, %k4, %k4
+; AVX512BW-NEXT:    korw %k4, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k2, %k2
+; AVX512BW-NEXT:    kshiftrd $7, %k0, %k4
+; AVX512BW-NEXT:    kshiftlw $15, %k4, %k4
+; AVX512BW-NEXT:    kshiftrw $10, %k4, %k6
+; AVX512BW-NEXT:    korw %k6, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $9, %k4, %k6
+; AVX512BW-NEXT:    korw %k6, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $8, %k4, %k4
+; AVX512BW-NEXT:    korw %k4, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k2, %k2
+; AVX512BW-NEXT:    kshiftrd $8, %k0, %k4
+; AVX512BW-NEXT:    kshiftlw $15, %k4, %k4
+; AVX512BW-NEXT:    kshiftrw $7, %k4, %k6
+; AVX512BW-NEXT:    korw %k6, %k2, %k2
+; AVX512BW-NEXT:    kandw %k3, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $6, %k4, %k6
+; AVX512BW-NEXT:    korw %k6, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $5, %k4, %k4
+; AVX512BW-NEXT:    korw %k4, %k2, %k2
+; AVX512BW-NEXT:    kshiftrd $9, %k0, %k0
+; AVX512BW-NEXT:    kandw %k1, %k2, %k2
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $4, %k0, %k4
+; AVX512BW-NEXT:    korw %k4, %k2, %k2
+; AVX512BW-NEXT:    kandw %k5, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $3, %k0, %k4
+; AVX512BW-NEXT:    korw %k4, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $2, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k2, %k0
+; AVX512BW-NEXT:    kandw %k7, %k0, %k0
+; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
+; AVX512BW-NEXT:    kshiftlw $14, %k1, %k2
+; AVX512BW-NEXT:    korw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k1
+; AVX512BW-NEXT:    vmovdqa32 64(%rsi), %zmm5 {%k1} {z}
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, 64(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, 128(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, 192(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, 256(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, 320(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdx)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
-  %src.vec = load <32 x i1>, ptr %in.vec, align 64
-  %tgt.mask = shufflevector <32 x i1> %src.vec, <32 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
-  store <96 x i1> %tgt.mask, ptr %out.vec, align 64
+  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
+  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %tgt.mask = shufflevector <32 x i1> %src.mask, <32 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
+  %data = call <96 x i32> @llvm.masked.load.v96i32.p0(ptr %in.vec, i32 64, <96 x i1> %tgt.mask, <96 x i32> poison)
+  store <96 x i32> %data, ptr %out.vec, align 64
   ret void
 }
 
-define void @mask_replication_factor3_vf64(ptr %in.vec, ptr %out.vec) nounwind {
+define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512F-ONLY-LABEL: mask_replication_factor3_vf64:
 ; AVX512F-ONLY:       # %bb.0:
-; AVX512F-ONLY-NEXT:    pushq %rbp
-; AVX512F-ONLY-NEXT:    pushq %r15
-; AVX512F-ONLY-NEXT:    pushq %r14
-; AVX512F-ONLY-NEXT:    pushq %r13
-; AVX512F-ONLY-NEXT:    pushq %r12
-; AVX512F-ONLY-NEXT:    pushq %rbx
-; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k3
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm2
+; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; AVX512F-ONLY-NEXT:    movw $1, %ax
+; AVX512F-ONLY-NEXT:    kmovw %eax, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1}
 ; AVX512F-ONLY-NEXT:    kmovw 2(%rdi), %k1
-; AVX512F-ONLY-NEXT:    kmovw 4(%rdi), %k0
-; AVX512F-ONLY-NEXT:    kmovw 6(%rdi), %k2
-; AVX512F-ONLY-NEXT:    kshiftrw $7, %k0, %k4
-; AVX512F-ONLY-NEXT:    kmovw %k4, %eax
-; AVX512F-ONLY-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512F-ONLY-NEXT:    kshiftrw $15, %k1, %k4
-; AVX512F-ONLY-NEXT:    kmovw %k4, %ecx
-; AVX512F-ONLY-NEXT:    kshiftrw $14, %k1, %k4
-; AVX512F-ONLY-NEXT:    kmovw %k4, %edx
-; AVX512F-ONLY-NEXT:    kshiftrw $13, %k1, %k4
-; AVX512F-ONLY-NEXT:    kmovw %k4, %edi
-; AVX512F-ONLY-NEXT:    kshiftrw $12, %k1, %k4
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r8d
-; AVX512F-ONLY-NEXT:    kshiftrw $11, %k1, %k4
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r9d
-; AVX512F-ONLY-NEXT:    kshiftrw $15, %k2, %k4
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r11d
-; AVX512F-ONLY-NEXT:    kshiftrw $13, %k0, %k4
-; AVX512F-ONLY-NEXT:    kmovw %k4, %ebp
-; AVX512F-ONLY-NEXT:    kshiftrw $12, %k0, %k4
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r14d
-; AVX512F-ONLY-NEXT:    kshiftrw $10, %k3, %k4
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r15d
-; AVX512F-ONLY-NEXT:    kshiftrw $2, %k3, %k4
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r10d
-; AVX512F-ONLY-NEXT:    kshiftrw $1, %k3, %k4
-; AVX512F-ONLY-NEXT:    kmovw %k4, %ebx
-; AVX512F-ONLY-NEXT:    kshiftrw $3, %k3, %k4
-; AVX512F-ONLY-NEXT:    kmovw %k3, %r12d
-; AVX512F-ONLY-NEXT:    movzbl %r12b, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    leaq (%r12,%r12,2), %r13
-; AVX512F-ONLY-NEXT:    leaq (%r13,%r12,4), %r12
-; AVX512F-ONLY-NEXT:    movzbl %bl, %ebx
-; AVX512F-ONLY-NEXT:    andl $1, %ebx
-; AVX512F-ONLY-NEXT:    leaq (%r12,%rbx,8), %r12
-; AVX512F-ONLY-NEXT:    movq %rbx, %r13
-; AVX512F-ONLY-NEXT:    shlq $4, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $5, %rbx
-; AVX512F-ONLY-NEXT:    orq %r13, %rbx
-; AVX512F-ONLY-NEXT:    movzbl %r10b, %r10d
-; AVX512F-ONLY-NEXT:    andl $1, %r10d
-; AVX512F-ONLY-NEXT:    movq %r10, %r12
-; AVX512F-ONLY-NEXT:    shlq $6, %r12
-; AVX512F-ONLY-NEXT:    movq %r10, %r13
-; AVX512F-ONLY-NEXT:    shlq $7, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $4, %k3, %k4
-; AVX512F-ONLY-NEXT:    shlq $8, %r10
-; AVX512F-ONLY-NEXT:    orq %r13, %r10
-; AVX512F-ONLY-NEXT:    movzbl %r12b, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $9, %r13
-; AVX512F-ONLY-NEXT:    orq %r10, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %r10
-; AVX512F-ONLY-NEXT:    shlq $10, %r10
-; AVX512F-ONLY-NEXT:    orq %r13, %r10
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r13d
-; AVX512F-ONLY-NEXT:    kshiftrw $5, %k3, %k4
-; AVX512F-ONLY-NEXT:    shlq $11, %r12
-; AVX512F-ONLY-NEXT:    orq %r10, %r12
-; AVX512F-ONLY-NEXT:    movzbl %r13b, %r10d
-; AVX512F-ONLY-NEXT:    andl $1, %r10d
-; AVX512F-ONLY-NEXT:    movq %r10, %r13
-; AVX512F-ONLY-NEXT:    shlq $12, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    movq %r10, %r12
-; AVX512F-ONLY-NEXT:    shlq $13, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r13d
-; AVX512F-ONLY-NEXT:    kshiftrw $6, %k3, %k4
-; AVX512F-ONLY-NEXT:    shlq $14, %r10
-; AVX512F-ONLY-NEXT:    orq %r12, %r10
-; AVX512F-ONLY-NEXT:    movzbl %r13b, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $15, %r13
-; AVX512F-ONLY-NEXT:    orq %r10, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %r10
-; AVX512F-ONLY-NEXT:    shlq $16, %r10
-; AVX512F-ONLY-NEXT:    orq %r13, %r10
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r13d
-; AVX512F-ONLY-NEXT:    kshiftrw $7, %k3, %k4
-; AVX512F-ONLY-NEXT:    shlq $17, %r12
-; AVX512F-ONLY-NEXT:    orq %r10, %r12
-; AVX512F-ONLY-NEXT:    movzbl %r13b, %r10d
-; AVX512F-ONLY-NEXT:    andl $1, %r10d
-; AVX512F-ONLY-NEXT:    movq %r10, %r13
-; AVX512F-ONLY-NEXT:    shlq $18, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    movq %r10, %r12
-; AVX512F-ONLY-NEXT:    shlq $19, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r13d
-; AVX512F-ONLY-NEXT:    kshiftrw $8, %k3, %k4
-; AVX512F-ONLY-NEXT:    shlq $20, %r10
-; AVX512F-ONLY-NEXT:    orq %r12, %r10
-; AVX512F-ONLY-NEXT:    movzbl %r13b, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $21, %r13
-; AVX512F-ONLY-NEXT:    orq %r10, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %r10
-; AVX512F-ONLY-NEXT:    shlq $22, %r10
-; AVX512F-ONLY-NEXT:    orq %r13, %r10
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r13d
-; AVX512F-ONLY-NEXT:    kshiftrw $9, %k3, %k4
-; AVX512F-ONLY-NEXT:    shlq $23, %r12
-; AVX512F-ONLY-NEXT:    orq %r10, %r12
-; AVX512F-ONLY-NEXT:    movzbl %r13b, %r10d
-; AVX512F-ONLY-NEXT:    andl $1, %r10d
-; AVX512F-ONLY-NEXT:    movq %r10, %r13
-; AVX512F-ONLY-NEXT:    shlq $24, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    movq %r10, %r12
-; AVX512F-ONLY-NEXT:    shlq $25, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r13d
-; AVX512F-ONLY-NEXT:    kshiftrw $10, %k0, %k4
-; AVX512F-ONLY-NEXT:    shlq $26, %r10
-; AVX512F-ONLY-NEXT:    orq %r12, %r10
-; AVX512F-ONLY-NEXT:    movzbl %r13b, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $27, %r13
-; AVX512F-ONLY-NEXT:    orq %r10, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %r10
-; AVX512F-ONLY-NEXT:    shlq $28, %r10
-; AVX512F-ONLY-NEXT:    orq %r13, %r10
-; AVX512F-ONLY-NEXT:    shlq $29, %r12
-; AVX512F-ONLY-NEXT:    orq %r10, %r12
-; AVX512F-ONLY-NEXT:    movzbl %r15b, %r13d
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $30, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r10d
-; AVX512F-ONLY-NEXT:    kshiftrw $11, %k3, %k4
-; AVX512F-ONLY-NEXT:    shlq $31, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k4, %eax
-; AVX512F-ONLY-NEXT:    kshiftrw $12, %k3, %k4
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    shlq $32, %r15
-; AVX512F-ONLY-NEXT:    orq %r13, %r15
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $33, %r12
-; AVX512F-ONLY-NEXT:    orq %r15, %r12
-; AVX512F-ONLY-NEXT:    movq %rax, %r15
-; AVX512F-ONLY-NEXT:    shlq $34, %r15
-; AVX512F-ONLY-NEXT:    orq %r12, %r15
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $13, %k3, %k4
-; AVX512F-ONLY-NEXT:    shlq $35, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %r15
-; AVX512F-ONLY-NEXT:    shlq $36, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $37, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r15d
-; AVX512F-ONLY-NEXT:    kshiftrw $14, %k3, %k4
-; AVX512F-ONLY-NEXT:    shlq $38, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $39, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $40, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k4, %eax
-; AVX512F-ONLY-NEXT:    kshiftrw $15, %k3, %k3
-; AVX512F-ONLY-NEXT:    shlq $41, %r15
-; AVX512F-ONLY-NEXT:    orq %r12, %r15
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $42, %r12
-; AVX512F-ONLY-NEXT:    orq %r15, %r12
-; AVX512F-ONLY-NEXT:    movq %rax, %r15
-; AVX512F-ONLY-NEXT:    shlq $43, %r15
-; AVX512F-ONLY-NEXT:    orq %r12, %r15
-; AVX512F-ONLY-NEXT:    kmovw %k3, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $1, %k1, %k3
-; AVX512F-ONLY-NEXT:    shlq $44, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %r15
-; AVX512F-ONLY-NEXT:    shlq $45, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $46, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $47, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k1, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %r15
-; AVX512F-ONLY-NEXT:    shlq $48, %r15
-; AVX512F-ONLY-NEXT:    orq %r12, %r15
-; AVX512F-ONLY-NEXT:    movq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $49, %r12
-; AVX512F-ONLY-NEXT:    orq %r15, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k3, %r15d
-; AVX512F-ONLY-NEXT:    kshiftrw $2, %k1, %k3
-; AVX512F-ONLY-NEXT:    shlq $50, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $51, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $52, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k3, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $3, %k1, %k3
-; AVX512F-ONLY-NEXT:    shlq $53, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $54, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r15
-; AVX512F-ONLY-NEXT:    shlq $55, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    kmovw %k3, %eax
-; AVX512F-ONLY-NEXT:    kshiftrw $4, %k1, %k3
-; AVX512F-ONLY-NEXT:    shlq $56, %r12
-; AVX512F-ONLY-NEXT:    orq %r15, %r12
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %r15
-; AVX512F-ONLY-NEXT:    shlq $57, %r15
-; AVX512F-ONLY-NEXT:    orq %r12, %r15
-; AVX512F-ONLY-NEXT:    movq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $58, %r12
-; AVX512F-ONLY-NEXT:    orq %r15, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k3, %r13d
-; AVX512F-ONLY-NEXT:    kshiftrw $5, %k1, %k3
-; AVX512F-ONLY-NEXT:    shlq $59, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %r15
-; AVX512F-ONLY-NEXT:    shlq $60, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $61, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k3, %r15d
-; AVX512F-ONLY-NEXT:    kshiftrw $11, %k0, %k3
-; AVX512F-ONLY-NEXT:    shlq $62, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movzbl %r15b, %r15d
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $63, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k3, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $14, %k0, %k3
-; AVX512F-ONLY-NEXT:    orq %rbx, %rax
-; AVX512F-ONLY-NEXT:    movq %rax, (%rsi)
-; AVX512F-ONLY-NEXT:    movzbl %r12b, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movzbl %r10b, %ebx
-; AVX512F-ONLY-NEXT:    movl %ebx, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    leaq (%r12,%rax,2), %r12
-; AVX512F-ONLY-NEXT:    leaq (%r12,%rax,4), %r12
-; AVX512F-ONLY-NEXT:    leaq (%r12,%rax,8), %rax
-; AVX512F-ONLY-NEXT:    movzbl %r14b, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $4, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %r14
-; AVX512F-ONLY-NEXT:    shlq $5, %r14
-; AVX512F-ONLY-NEXT:    orq %r13, %r14
-; AVX512F-ONLY-NEXT:    shlq $6, %r12
-; AVX512F-ONLY-NEXT:    movzbl %bpl, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %r13
-; AVX512F-ONLY-NEXT:    shlq $7, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    movq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $8, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k3, %ebp
-; AVX512F-ONLY-NEXT:    kshiftrw $15, %k0, %k3
-; AVX512F-ONLY-NEXT:    shlq $9, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movzbl %bpl, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $10, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $11, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k3, %ebp
-; AVX512F-ONLY-NEXT:    kshiftrw $1, %k2, %k3
-; AVX512F-ONLY-NEXT:    shlq $12, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movzbl %bpl, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %r13
-; AVX512F-ONLY-NEXT:    shlq $13, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    movq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $14, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $15, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k2, %ebp
-; AVX512F-ONLY-NEXT:    movzbl %bpl, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $16, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $17, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k3, %ebp
-; AVX512F-ONLY-NEXT:    kshiftrw $2, %k2, %k3
-; AVX512F-ONLY-NEXT:    shlq $18, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movzbl %bpl, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %r13
-; AVX512F-ONLY-NEXT:    shlq $19, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    movq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $20, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k3, %ebp
-; AVX512F-ONLY-NEXT:    kshiftrw $3, %k2, %k3
-; AVX512F-ONLY-NEXT:    shlq $21, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movzbl %bpl, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $22, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $23, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k3, %ebp
-; AVX512F-ONLY-NEXT:    kshiftrw $4, %k2, %k3
-; AVX512F-ONLY-NEXT:    shlq $24, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movzbl %bpl, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %r13
-; AVX512F-ONLY-NEXT:    shlq $25, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    movq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $26, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k3, %ebp
-; AVX512F-ONLY-NEXT:    kshiftrw $5, %k2, %k3
-; AVX512F-ONLY-NEXT:    shlq $27, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movzbl %bpl, %r13d
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $28, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $29, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k3, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $6, %k2, %k3
-; AVX512F-ONLY-NEXT:    shlq $30, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movzbl %r12b, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    shlq $31, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $32, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k3, %eax
-; AVX512F-ONLY-NEXT:    kshiftrw $7, %k2, %k3
-; AVX512F-ONLY-NEXT:    shlq $33, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %r13
-; AVX512F-ONLY-NEXT:    shlq $34, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    movq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $35, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k3, %r13d
-; AVX512F-ONLY-NEXT:    kshiftrw $8, %k2, %k3
-; AVX512F-ONLY-NEXT:    shlq $36, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $37, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $38, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k3, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $9, %k2, %k3
-; AVX512F-ONLY-NEXT:    shlq $39, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $40, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $41, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k3, %eax
-; AVX512F-ONLY-NEXT:    kshiftrw $10, %k2, %k3
-; AVX512F-ONLY-NEXT:    shlq $42, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %r13
-; AVX512F-ONLY-NEXT:    shlq $43, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    movq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $44, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k3, %r13d
-; AVX512F-ONLY-NEXT:    kshiftrw $11, %k2, %k3
-; AVX512F-ONLY-NEXT:    shlq $45, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $46, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $47, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k3, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $12, %k2, %k3
-; AVX512F-ONLY-NEXT:    shlq $48, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $49, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $50, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k3, %eax
-; AVX512F-ONLY-NEXT:    kshiftrw $13, %k2, %k3
-; AVX512F-ONLY-NEXT:    shlq $51, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %r13
-; AVX512F-ONLY-NEXT:    shlq $52, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    movq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $53, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k3, %r13d
-; AVX512F-ONLY-NEXT:    kshiftrw $14, %k2, %k2
-; AVX512F-ONLY-NEXT:    shlq $54, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $55, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $56, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $10, %k1, %k2
-; AVX512F-ONLY-NEXT:    shlq $57, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $58, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $59, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    shlq $60, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    movl %r11d, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %r13
-; AVX512F-ONLY-NEXT:    shlq $61, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k2, %ebp
-; AVX512F-ONLY-NEXT:    kshiftrw $9, %k1, %k2
-; AVX512F-ONLY-NEXT:    shlq $62, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $8, %k1, %k2
-; AVX512F-ONLY-NEXT:    movzbl %r11b, %r11d
-; AVX512F-ONLY-NEXT:    shlq $63, %r11
-; AVX512F-ONLY-NEXT:    orq %rax, %r11
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r13d
-; AVX512F-ONLY-NEXT:    kshiftrw $7, %k1, %k2
-; AVX512F-ONLY-NEXT:    orq %r14, %r11
-; AVX512F-ONLY-NEXT:    kmovw %k2, %eax
-; AVX512F-ONLY-NEXT:    kshiftrw $6, %k1, %k1
-; AVX512F-ONLY-NEXT:    movq %r11, 16(%rsi)
-; AVX512F-ONLY-NEXT:    kmovw %k1, %r11d
-; AVX512F-ONLY-NEXT:    kshiftrw $1, %k0, %k1
-; AVX512F-ONLY-NEXT:    movzbl %r11b, %r14d
-; AVX512F-ONLY-NEXT:    andl $1, %r14d
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    leaq (%r15,%r15,2), %r11
-; AVX512F-ONLY-NEXT:    leaq (%r11,%r14,4), %r11
-; AVX512F-ONLY-NEXT:    leaq (%r11,%r14,8), %r11
-; AVX512F-ONLY-NEXT:    shlq $4, %r14
-; AVX512F-ONLY-NEXT:    orq %r11, %r14
-; AVX512F-ONLY-NEXT:    movzbl %al, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %r11
-; AVX512F-ONLY-NEXT:    shlq $5, %r11
-; AVX512F-ONLY-NEXT:    orq %r14, %r11
-; AVX512F-ONLY-NEXT:    movq %rax, %r14
-; AVX512F-ONLY-NEXT:    shlq $6, %r14
-; AVX512F-ONLY-NEXT:    shlq $7, %rax
-; AVX512F-ONLY-NEXT:    orq %r14, %rax
-; AVX512F-ONLY-NEXT:    movzbl %r13b, %r14d
-; AVX512F-ONLY-NEXT:    andl $1, %r14d
-; AVX512F-ONLY-NEXT:    movq %r14, %r13
-; AVX512F-ONLY-NEXT:    shlq $8, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r14, %r15
-; AVX512F-ONLY-NEXT:    shlq $9, %r15
-; AVX512F-ONLY-NEXT:    orq %r13, %r15
-; AVX512F-ONLY-NEXT:    kmovw %k1, %r13d
-; AVX512F-ONLY-NEXT:    kshiftrw $2, %k0, %k1
-; AVX512F-ONLY-NEXT:    shlq $10, %r14
-; AVX512F-ONLY-NEXT:    orq %r15, %r14
-; AVX512F-ONLY-NEXT:    movzbl %r12b, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %r15
-; AVX512F-ONLY-NEXT:    shlq $11, %r15
-; AVX512F-ONLY-NEXT:    orq %r14, %r15
-; AVX512F-ONLY-NEXT:    movq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $12, %r12
-; AVX512F-ONLY-NEXT:    orq %r15, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k1, %r14d
-; AVX512F-ONLY-NEXT:    kshiftrw $3, %k0, %k1
-; AVX512F-ONLY-NEXT:    shlq $13, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movzbl %bpl, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %r15
-; AVX512F-ONLY-NEXT:    shlq $14, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $15, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k1, %r15d
-; AVX512F-ONLY-NEXT:    kshiftrw $4, %k0, %k1
-; AVX512F-ONLY-NEXT:    shlq $16, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movzbl %r9b, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %r9
-; AVX512F-ONLY-NEXT:    shlq $17, %r9
-; AVX512F-ONLY-NEXT:    orq %r12, %r9
-; AVX512F-ONLY-NEXT:    movq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $18, %r12
-; AVX512F-ONLY-NEXT:    orq %r9, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k1, %r9d
-; AVX512F-ONLY-NEXT:    kshiftrw $5, %k0, %k1
-; AVX512F-ONLY-NEXT:    shlq $19, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movzbl %r8b, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %r8
-; AVX512F-ONLY-NEXT:    shlq $20, %r8
-; AVX512F-ONLY-NEXT:    orq %rax, %r8
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $21, %rax
-; AVX512F-ONLY-NEXT:    orq %r8, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k1, %r8d
-; AVX512F-ONLY-NEXT:    kshiftrw $6, %k0, %k1
-; AVX512F-ONLY-NEXT:    shlq $22, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movzbl %dil, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %rdi
-; AVX512F-ONLY-NEXT:    shlq $23, %rdi
-; AVX512F-ONLY-NEXT:    orq %r12, %rdi
-; AVX512F-ONLY-NEXT:    movq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $24, %r12
-; AVX512F-ONLY-NEXT:    orq %rdi, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k1, %edi
-; AVX512F-ONLY-NEXT:    kshiftrw $8, %k0, %k1
-; AVX512F-ONLY-NEXT:    shlq $25, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movzbl %dl, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rdx
-; AVX512F-ONLY-NEXT:    shlq $26, %rdx
-; AVX512F-ONLY-NEXT:    orq %rax, %rdx
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $27, %rax
-; AVX512F-ONLY-NEXT:    orq %rdx, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k1, %edx
-; AVX512F-ONLY-NEXT:    shlq $28, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movzbl %cl, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %rcx
-; AVX512F-ONLY-NEXT:    shlq $29, %rcx
-; AVX512F-ONLY-NEXT:    orq %r12, %rcx
-; AVX512F-ONLY-NEXT:    movq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $30, %r12
-; AVX512F-ONLY-NEXT:    orq %rcx, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k0, %ebp
-; AVX512F-ONLY-NEXT:    kshiftrw $9, %k0, %k0
-; AVX512F-ONLY-NEXT:    shlq $31, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    andl $1, %ebp
-; AVX512F-ONLY-NEXT:    movq %rbp, %rcx
-; AVX512F-ONLY-NEXT:    shlq $32, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    movq %rbp, %rax
-; AVX512F-ONLY-NEXT:    shlq $33, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k0, %ecx
-; AVX512F-ONLY-NEXT:    shlq $34, %rbp
-; AVX512F-ONLY-NEXT:    orq %rax, %rbp
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $35, %rax
-; AVX512F-ONLY-NEXT:    orq %rbp, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $36, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $37, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    andl $1, %r14d
-; AVX512F-ONLY-NEXT:    movq %r14, %rax
-; AVX512F-ONLY-NEXT:    shlq $38, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r14, %r12
-; AVX512F-ONLY-NEXT:    shlq $39, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $40, %r14
-; AVX512F-ONLY-NEXT:    orq %r12, %r14
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $41, %rax
-; AVX512F-ONLY-NEXT:    orq %r14, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %r14
-; AVX512F-ONLY-NEXT:    shlq $42, %r14
-; AVX512F-ONLY-NEXT:    orq %rax, %r14
-; AVX512F-ONLY-NEXT:    shlq $43, %r15
-; AVX512F-ONLY-NEXT:    orq %r14, %r15
-; AVX512F-ONLY-NEXT:    andl $1, %r9d
-; AVX512F-ONLY-NEXT:    movq %r9, %rax
-; AVX512F-ONLY-NEXT:    shlq $44, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movq %r9, %r14
-; AVX512F-ONLY-NEXT:    shlq $45, %r14
-; AVX512F-ONLY-NEXT:    orq %rax, %r14
-; AVX512F-ONLY-NEXT:    shlq $46, %r9
-; AVX512F-ONLY-NEXT:    orq %r14, %r9
-; AVX512F-ONLY-NEXT:    andl $1, %r8d
-; AVX512F-ONLY-NEXT:    movq %r8, %rax
-; AVX512F-ONLY-NEXT:    shlq $47, %rax
-; AVX512F-ONLY-NEXT:    orq %r9, %rax
-; AVX512F-ONLY-NEXT:    movq %r8, %r9
-; AVX512F-ONLY-NEXT:    shlq $48, %r9
-; AVX512F-ONLY-NEXT:    orq %rax, %r9
-; AVX512F-ONLY-NEXT:    shlq $49, %r8
-; AVX512F-ONLY-NEXT:    orq %r9, %r8
-; AVX512F-ONLY-NEXT:    andl $1, %edi
-; AVX512F-ONLY-NEXT:    movq %rdi, %rax
-; AVX512F-ONLY-NEXT:    shlq $50, %rax
-; AVX512F-ONLY-NEXT:    orq %r8, %rax
-; AVX512F-ONLY-NEXT:    movq %rdi, %r8
-; AVX512F-ONLY-NEXT:    shlq $51, %r8
-; AVX512F-ONLY-NEXT:    orq %rax, %r8
-; AVX512F-ONLY-NEXT:    shlq $52, %rdi
-; AVX512F-ONLY-NEXT:    orq %r8, %rdi
-; AVX512F-ONLY-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %r8
-; AVX512F-ONLY-NEXT:    shlq $53, %r8
-; AVX512F-ONLY-NEXT:    orq %rdi, %r8
-; AVX512F-ONLY-NEXT:    movq %rax, %rdi
-; AVX512F-ONLY-NEXT:    shlq $54, %rdi
-; AVX512F-ONLY-NEXT:    orq %r8, %rdi
-; AVX512F-ONLY-NEXT:    shlq $55, %rax
-; AVX512F-ONLY-NEXT:    orq %rdi, %rax
-; AVX512F-ONLY-NEXT:    andl $1, %edx
-; AVX512F-ONLY-NEXT:    movq %rdx, %rdi
-; AVX512F-ONLY-NEXT:    shlq $56, %rdi
-; AVX512F-ONLY-NEXT:    orq %rax, %rdi
-; AVX512F-ONLY-NEXT:    movq %rdx, %rax
-; AVX512F-ONLY-NEXT:    shlq $57, %rax
-; AVX512F-ONLY-NEXT:    orq %rdi, %rax
-; AVX512F-ONLY-NEXT:    shlq $58, %rdx
-; AVX512F-ONLY-NEXT:    orq %rax, %rdx
-; AVX512F-ONLY-NEXT:    andl $1, %ecx
-; AVX512F-ONLY-NEXT:    movq %rcx, %rax
-; AVX512F-ONLY-NEXT:    shlq $59, %rax
-; AVX512F-ONLY-NEXT:    orq %rdx, %rax
-; AVX512F-ONLY-NEXT:    movq %rcx, %rdx
-; AVX512F-ONLY-NEXT:    shlq $60, %rdx
-; AVX512F-ONLY-NEXT:    orq %rax, %rdx
-; AVX512F-ONLY-NEXT:    shlq $61, %rcx
-; AVX512F-ONLY-NEXT:    orq %rdx, %rcx
-; AVX512F-ONLY-NEXT:    andl $1, %r10d
-; AVX512F-ONLY-NEXT:    shlq $62, %r10
-; AVX512F-ONLY-NEXT:    orq %rcx, %r10
-; AVX512F-ONLY-NEXT:    shlq $63, %rbx
-; AVX512F-ONLY-NEXT:    orq %r10, %rbx
-; AVX512F-ONLY-NEXT:    orq %r11, %rbx
-; AVX512F-ONLY-NEXT:    movq %rbx, 8(%rsi)
-; AVX512F-ONLY-NEXT:    popq %rbx
-; AVX512F-ONLY-NEXT:    popq %r12
-; AVX512F-ONLY-NEXT:    popq %r13
-; AVX512F-ONLY-NEXT:    popq %r14
-; AVX512F-ONLY-NEXT:    popq %r15
-; AVX512F-ONLY-NEXT:    popq %rbp
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
+; AVX512F-ONLY-NEXT:    kmovw 4(%rdi), %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z}
+; AVX512F-ONLY-NEXT:    kmovw 6(%rdi), %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm2, %zmm6
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm7, %zmm0
+; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm1, %zmm8
+; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm2, %zmm9
+; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm7, %zmm3
+; AVX512F-ONLY-NEXT:    vpermd %zmm4, %zmm1, %zmm10
+; AVX512F-ONLY-NEXT:    vpermd %zmm4, %zmm2, %zmm11
+; AVX512F-ONLY-NEXT:    vpermd %zmm4, %zmm7, %zmm4
+; AVX512F-ONLY-NEXT:    vpermd %zmm5, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    vpermd %zmm5, %zmm2, %zmm2
+; AVX512F-ONLY-NEXT:    vpermd %zmm5, %zmm7, %zmm5
+; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm7 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm5, %zmm5, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 704(%rsi), %zmm5 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 640(%rsi), %zmm2 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 576(%rsi), %zmm1 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm4, %zmm4, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 512(%rsi), %zmm4 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm11, %zmm11, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 448(%rsi), %zmm11 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm10, %zmm10, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 384(%rsi), %zmm10 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm3, %zmm3, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 320(%rsi), %zmm3 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm9, %zmm9, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 256(%rsi), %zmm9 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm8, %zmm8, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm8 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm6, %zmm6, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm6 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm6, 64(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, 128(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm8, 192(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm9, 256(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm3, 320(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm10, 384(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm11, 448(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm4, 512(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm1, 576(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm2, 640(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm5, 704(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm7, (%rdx)
+; AVX512F-ONLY-NEXT:    vzeroupper
 ; AVX512F-ONLY-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: mask_replication_factor3_vf64:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    pushq %rbp
-; AVX512DQ-NEXT:    pushq %r15
-; AVX512DQ-NEXT:    pushq %r14
-; AVX512DQ-NEXT:    pushq %r13
-; AVX512DQ-NEXT:    pushq %r12
-; AVX512DQ-NEXT:    pushq %rbx
-; AVX512DQ-NEXT:    kmovw (%rdi), %k3
-; AVX512DQ-NEXT:    kmovw 2(%rdi), %k1
+; AVX512DQ-NEXT:    kmovw (%rdi), %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm2
+; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm2
+; AVX512DQ-NEXT:    movw $1, %ax
+; AVX512DQ-NEXT:    kmovw %eax, %k1
+; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1}
+; AVX512DQ-NEXT:    kmovw 2(%rdi), %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm3
 ; AVX512DQ-NEXT:    kmovw 4(%rdi), %k0
-; AVX512DQ-NEXT:    kmovw 6(%rdi), %k2
-; AVX512DQ-NEXT:    kshiftrw $7, %k0, %k4
-; AVX512DQ-NEXT:    kmovw %k4, %eax
-; AVX512DQ-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k4
-; AVX512DQ-NEXT:    kmovw %k4, %ecx
-; AVX512DQ-NEXT:    kshiftrw $14, %k1, %k4
-; AVX512DQ-NEXT:    kmovw %k4, %edx
-; AVX512DQ-NEXT:    kshiftrw $13, %k1, %k4
-; AVX512DQ-NEXT:    kmovw %k4, %edi
-; AVX512DQ-NEXT:    kshiftrw $12, %k1, %k4
-; AVX512DQ-NEXT:    kmovw %k4, %r8d
-; AVX512DQ-NEXT:    kshiftrw $11, %k1, %k4
-; AVX512DQ-NEXT:    kmovw %k4, %r9d
-; AVX512DQ-NEXT:    kshiftrw $15, %k2, %k4
-; AVX512DQ-NEXT:    kmovw %k4, %r11d
-; AVX512DQ-NEXT:    kshiftrw $13, %k0, %k4
-; AVX512DQ-NEXT:    kmovw %k4, %ebp
-; AVX512DQ-NEXT:    kshiftrw $12, %k0, %k4
-; AVX512DQ-NEXT:    kmovw %k4, %r14d
-; AVX512DQ-NEXT:    kshiftrw $10, %k3, %k4
-; AVX512DQ-NEXT:    kmovw %k4, %r15d
-; AVX512DQ-NEXT:    kshiftrw $2, %k3, %k4
-; AVX512DQ-NEXT:    kmovw %k4, %r10d
-; AVX512DQ-NEXT:    kshiftrw $1, %k3, %k4
-; AVX512DQ-NEXT:    kmovw %k4, %ebx
-; AVX512DQ-NEXT:    kshiftrw $3, %k3, %k4
-; AVX512DQ-NEXT:    kmovw %k3, %r12d
-; AVX512DQ-NEXT:    movzbl %r12b, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    leaq (%r12,%r12,2), %r13
-; AVX512DQ-NEXT:    leaq (%r13,%r12,4), %r12
-; AVX512DQ-NEXT:    movzbl %bl, %ebx
-; AVX512DQ-NEXT:    andl $1, %ebx
-; AVX512DQ-NEXT:    leaq (%r12,%rbx,8), %r12
-; AVX512DQ-NEXT:    movq %rbx, %r13
-; AVX512DQ-NEXT:    shlq $4, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    shlq $5, %rbx
-; AVX512DQ-NEXT:    orq %r13, %rbx
-; AVX512DQ-NEXT:    movzbl %r10b, %r10d
-; AVX512DQ-NEXT:    andl $1, %r10d
-; AVX512DQ-NEXT:    movq %r10, %r12
-; AVX512DQ-NEXT:    shlq $6, %r12
-; AVX512DQ-NEXT:    movq %r10, %r13
-; AVX512DQ-NEXT:    shlq $7, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    kmovw %k4, %r12d
-; AVX512DQ-NEXT:    kshiftrw $4, %k3, %k4
-; AVX512DQ-NEXT:    shlq $8, %r10
-; AVX512DQ-NEXT:    orq %r13, %r10
-; AVX512DQ-NEXT:    movzbl %r12b, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $9, %r13
-; AVX512DQ-NEXT:    orq %r10, %r13
-; AVX512DQ-NEXT:    movq %r12, %r10
-; AVX512DQ-NEXT:    shlq $10, %r10
-; AVX512DQ-NEXT:    orq %r13, %r10
-; AVX512DQ-NEXT:    kmovw %k4, %r13d
-; AVX512DQ-NEXT:    kshiftrw $5, %k3, %k4
-; AVX512DQ-NEXT:    shlq $11, %r12
-; AVX512DQ-NEXT:    orq %r10, %r12
-; AVX512DQ-NEXT:    movzbl %r13b, %r10d
-; AVX512DQ-NEXT:    andl $1, %r10d
-; AVX512DQ-NEXT:    movq %r10, %r13
-; AVX512DQ-NEXT:    shlq $12, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    movq %r10, %r12
-; AVX512DQ-NEXT:    shlq $13, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    kmovw %k4, %r13d
-; AVX512DQ-NEXT:    kshiftrw $6, %k3, %k4
-; AVX512DQ-NEXT:    shlq $14, %r10
-; AVX512DQ-NEXT:    orq %r12, %r10
-; AVX512DQ-NEXT:    movzbl %r13b, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $15, %r13
-; AVX512DQ-NEXT:    orq %r10, %r13
-; AVX512DQ-NEXT:    movq %r12, %r10
-; AVX512DQ-NEXT:    shlq $16, %r10
-; AVX512DQ-NEXT:    orq %r13, %r10
-; AVX512DQ-NEXT:    kmovw %k4, %r13d
-; AVX512DQ-NEXT:    kshiftrw $7, %k3, %k4
-; AVX512DQ-NEXT:    shlq $17, %r12
-; AVX512DQ-NEXT:    orq %r10, %r12
-; AVX512DQ-NEXT:    movzbl %r13b, %r10d
-; AVX512DQ-NEXT:    andl $1, %r10d
-; AVX512DQ-NEXT:    movq %r10, %r13
-; AVX512DQ-NEXT:    shlq $18, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    movq %r10, %r12
-; AVX512DQ-NEXT:    shlq $19, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    kmovw %k4, %r13d
-; AVX512DQ-NEXT:    kshiftrw $8, %k3, %k4
-; AVX512DQ-NEXT:    shlq $20, %r10
-; AVX512DQ-NEXT:    orq %r12, %r10
-; AVX512DQ-NEXT:    movzbl %r13b, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $21, %r13
-; AVX512DQ-NEXT:    orq %r10, %r13
-; AVX512DQ-NEXT:    movq %r12, %r10
-; AVX512DQ-NEXT:    shlq $22, %r10
-; AVX512DQ-NEXT:    orq %r13, %r10
-; AVX512DQ-NEXT:    kmovw %k4, %r13d
-; AVX512DQ-NEXT:    kshiftrw $9, %k3, %k4
-; AVX512DQ-NEXT:    shlq $23, %r12
-; AVX512DQ-NEXT:    orq %r10, %r12
-; AVX512DQ-NEXT:    movzbl %r13b, %r10d
-; AVX512DQ-NEXT:    andl $1, %r10d
-; AVX512DQ-NEXT:    movq %r10, %r13
-; AVX512DQ-NEXT:    shlq $24, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    movq %r10, %r12
-; AVX512DQ-NEXT:    shlq $25, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    kmovw %k4, %r13d
-; AVX512DQ-NEXT:    kshiftrw $10, %k0, %k4
-; AVX512DQ-NEXT:    shlq $26, %r10
-; AVX512DQ-NEXT:    orq %r12, %r10
-; AVX512DQ-NEXT:    movzbl %r13b, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $27, %r13
-; AVX512DQ-NEXT:    orq %r10, %r13
-; AVX512DQ-NEXT:    movq %r12, %r10
-; AVX512DQ-NEXT:    shlq $28, %r10
-; AVX512DQ-NEXT:    orq %r13, %r10
-; AVX512DQ-NEXT:    shlq $29, %r12
-; AVX512DQ-NEXT:    orq %r10, %r12
-; AVX512DQ-NEXT:    movzbl %r15b, %r13d
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $30, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    kmovw %k4, %r10d
-; AVX512DQ-NEXT:    kshiftrw $11, %k3, %k4
-; AVX512DQ-NEXT:    shlq $31, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    kmovw %k4, %eax
-; AVX512DQ-NEXT:    kshiftrw $12, %k3, %k4
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    shlq $32, %r15
-; AVX512DQ-NEXT:    orq %r13, %r15
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %r12
-; AVX512DQ-NEXT:    shlq $33, %r12
-; AVX512DQ-NEXT:    orq %r15, %r12
-; AVX512DQ-NEXT:    movq %rax, %r15
-; AVX512DQ-NEXT:    shlq $34, %r15
-; AVX512DQ-NEXT:    orq %r12, %r15
-; AVX512DQ-NEXT:    kmovw %k4, %r12d
-; AVX512DQ-NEXT:    kshiftrw $13, %k3, %k4
-; AVX512DQ-NEXT:    shlq $35, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %r15
-; AVX512DQ-NEXT:    shlq $36, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $37, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    kmovw %k4, %r15d
-; AVX512DQ-NEXT:    kshiftrw $14, %k3, %k4
-; AVX512DQ-NEXT:    shlq $38, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $39, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $40, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    kmovw %k4, %eax
-; AVX512DQ-NEXT:    kshiftrw $15, %k3, %k3
-; AVX512DQ-NEXT:    shlq $41, %r15
-; AVX512DQ-NEXT:    orq %r12, %r15
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %r12
-; AVX512DQ-NEXT:    shlq $42, %r12
-; AVX512DQ-NEXT:    orq %r15, %r12
-; AVX512DQ-NEXT:    movq %rax, %r15
-; AVX512DQ-NEXT:    shlq $43, %r15
-; AVX512DQ-NEXT:    orq %r12, %r15
-; AVX512DQ-NEXT:    kmovw %k3, %r12d
-; AVX512DQ-NEXT:    kshiftrw $1, %k1, %k3
-; AVX512DQ-NEXT:    shlq $44, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %r15
-; AVX512DQ-NEXT:    shlq $45, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $46, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    shlq $47, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    kmovw %k1, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %r15
-; AVX512DQ-NEXT:    shlq $48, %r15
-; AVX512DQ-NEXT:    orq %r12, %r15
-; AVX512DQ-NEXT:    movq %rax, %r12
-; AVX512DQ-NEXT:    shlq $49, %r12
-; AVX512DQ-NEXT:    orq %r15, %r12
-; AVX512DQ-NEXT:    kmovw %k3, %r15d
-; AVX512DQ-NEXT:    kshiftrw $2, %k1, %k3
-; AVX512DQ-NEXT:    shlq $50, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $51, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $52, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    kmovw %k3, %r12d
-; AVX512DQ-NEXT:    kshiftrw $3, %k1, %k3
-; AVX512DQ-NEXT:    shlq $53, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $54, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movq %r12, %r15
-; AVX512DQ-NEXT:    shlq $55, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    kmovw %k3, %eax
-; AVX512DQ-NEXT:    kshiftrw $4, %k1, %k3
-; AVX512DQ-NEXT:    shlq $56, %r12
-; AVX512DQ-NEXT:    orq %r15, %r12
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %r15
-; AVX512DQ-NEXT:    shlq $57, %r15
-; AVX512DQ-NEXT:    orq %r12, %r15
-; AVX512DQ-NEXT:    movq %rax, %r12
-; AVX512DQ-NEXT:    shlq $58, %r12
-; AVX512DQ-NEXT:    orq %r15, %r12
-; AVX512DQ-NEXT:    kmovw %k3, %r13d
-; AVX512DQ-NEXT:    kshiftrw $5, %k1, %k3
-; AVX512DQ-NEXT:    shlq $59, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %r15
-; AVX512DQ-NEXT:    shlq $60, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $61, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    kmovw %k3, %r15d
-; AVX512DQ-NEXT:    kshiftrw $11, %k0, %k3
-; AVX512DQ-NEXT:    shlq $62, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movzbl %r15b, %r15d
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $63, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    kmovw %k3, %r12d
-; AVX512DQ-NEXT:    kshiftrw $14, %k0, %k3
-; AVX512DQ-NEXT:    orq %rbx, %rax
-; AVX512DQ-NEXT:    movq %rax, (%rsi)
-; AVX512DQ-NEXT:    movzbl %r12b, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movzbl %r10b, %ebx
-; AVX512DQ-NEXT:    movl %ebx, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    leaq (%r12,%rax,2), %r12
-; AVX512DQ-NEXT:    leaq (%r12,%rax,4), %r12
-; AVX512DQ-NEXT:    leaq (%r12,%rax,8), %rax
-; AVX512DQ-NEXT:    movzbl %r14b, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $4, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r12, %r14
-; AVX512DQ-NEXT:    shlq $5, %r14
-; AVX512DQ-NEXT:    orq %r13, %r14
-; AVX512DQ-NEXT:    shlq $6, %r12
-; AVX512DQ-NEXT:    movzbl %bpl, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %r13
-; AVX512DQ-NEXT:    shlq $7, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    movq %rax, %r12
-; AVX512DQ-NEXT:    shlq $8, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    kmovw %k3, %ebp
-; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k3
-; AVX512DQ-NEXT:    shlq $9, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movzbl %bpl, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $10, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $11, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    kmovw %k3, %ebp
-; AVX512DQ-NEXT:    kshiftrw $1, %k2, %k3
-; AVX512DQ-NEXT:    shlq $12, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movzbl %bpl, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %r13
-; AVX512DQ-NEXT:    shlq $13, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    movq %rax, %r12
-; AVX512DQ-NEXT:    shlq $14, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    shlq $15, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    kmovw %k2, %ebp
-; AVX512DQ-NEXT:    movzbl %bpl, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $16, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $17, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    kmovw %k3, %ebp
-; AVX512DQ-NEXT:    kshiftrw $2, %k2, %k3
-; AVX512DQ-NEXT:    shlq $18, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movzbl %bpl, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %r13
-; AVX512DQ-NEXT:    shlq $19, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    movq %rax, %r12
-; AVX512DQ-NEXT:    shlq $20, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    kmovw %k3, %ebp
-; AVX512DQ-NEXT:    kshiftrw $3, %k2, %k3
-; AVX512DQ-NEXT:    shlq $21, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movzbl %bpl, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $22, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $23, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    kmovw %k3, %ebp
-; AVX512DQ-NEXT:    kshiftrw $4, %k2, %k3
-; AVX512DQ-NEXT:    shlq $24, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movzbl %bpl, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %r13
-; AVX512DQ-NEXT:    shlq $25, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    movq %rax, %r12
-; AVX512DQ-NEXT:    shlq $26, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    kmovw %k3, %ebp
-; AVX512DQ-NEXT:    kshiftrw $5, %k2, %k3
-; AVX512DQ-NEXT:    shlq $27, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movzbl %bpl, %r13d
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $28, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $29, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    kmovw %k3, %r12d
-; AVX512DQ-NEXT:    kshiftrw $6, %k2, %k3
-; AVX512DQ-NEXT:    shlq $30, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movzbl %r12b, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    shlq $31, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $32, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    kmovw %k3, %eax
-; AVX512DQ-NEXT:    kshiftrw $7, %k2, %k3
-; AVX512DQ-NEXT:    shlq $33, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %r13
-; AVX512DQ-NEXT:    shlq $34, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    movq %rax, %r12
-; AVX512DQ-NEXT:    shlq $35, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    kmovw %k3, %r13d
-; AVX512DQ-NEXT:    kshiftrw $8, %k2, %k3
-; AVX512DQ-NEXT:    shlq $36, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $37, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $38, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    kmovw %k3, %r12d
-; AVX512DQ-NEXT:    kshiftrw $9, %k2, %k3
-; AVX512DQ-NEXT:    shlq $39, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $40, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $41, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    kmovw %k3, %eax
-; AVX512DQ-NEXT:    kshiftrw $10, %k2, %k3
-; AVX512DQ-NEXT:    shlq $42, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %r13
-; AVX512DQ-NEXT:    shlq $43, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    movq %rax, %r12
-; AVX512DQ-NEXT:    shlq $44, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    kmovw %k3, %r13d
-; AVX512DQ-NEXT:    kshiftrw $11, %k2, %k3
-; AVX512DQ-NEXT:    shlq $45, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $46, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $47, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    kmovw %k3, %r12d
-; AVX512DQ-NEXT:    kshiftrw $12, %k2, %k3
-; AVX512DQ-NEXT:    shlq $48, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $49, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $50, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    kmovw %k3, %eax
-; AVX512DQ-NEXT:    kshiftrw $13, %k2, %k3
-; AVX512DQ-NEXT:    shlq $51, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %r13
-; AVX512DQ-NEXT:    shlq $52, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    movq %rax, %r12
-; AVX512DQ-NEXT:    shlq $53, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    kmovw %k3, %r13d
-; AVX512DQ-NEXT:    kshiftrw $14, %k2, %k2
-; AVX512DQ-NEXT:    shlq $54, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $55, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $56, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    kmovw %k2, %r12d
-; AVX512DQ-NEXT:    kshiftrw $10, %k1, %k2
-; AVX512DQ-NEXT:    shlq $57, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $58, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $59, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    shlq $60, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    movl %r11d, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %r13
-; AVX512DQ-NEXT:    shlq $61, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    kmovw %k2, %ebp
-; AVX512DQ-NEXT:    kshiftrw $9, %k1, %k2
-; AVX512DQ-NEXT:    shlq $62, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    kmovw %k2, %r12d
-; AVX512DQ-NEXT:    kshiftrw $8, %k1, %k2
-; AVX512DQ-NEXT:    movzbl %r11b, %r11d
-; AVX512DQ-NEXT:    shlq $63, %r11
-; AVX512DQ-NEXT:    orq %rax, %r11
-; AVX512DQ-NEXT:    kmovw %k2, %r13d
-; AVX512DQ-NEXT:    kshiftrw $7, %k1, %k2
-; AVX512DQ-NEXT:    orq %r14, %r11
-; AVX512DQ-NEXT:    kmovw %k2, %eax
-; AVX512DQ-NEXT:    kshiftrw $6, %k1, %k1
-; AVX512DQ-NEXT:    movq %r11, 16(%rsi)
-; AVX512DQ-NEXT:    kmovw %k1, %r11d
-; AVX512DQ-NEXT:    kshiftrw $1, %k0, %k1
-; AVX512DQ-NEXT:    movzbl %r11b, %r14d
-; AVX512DQ-NEXT:    andl $1, %r14d
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    leaq (%r15,%r15,2), %r11
-; AVX512DQ-NEXT:    leaq (%r11,%r14,4), %r11
-; AVX512DQ-NEXT:    leaq (%r11,%r14,8), %r11
-; AVX512DQ-NEXT:    shlq $4, %r14
-; AVX512DQ-NEXT:    orq %r11, %r14
-; AVX512DQ-NEXT:    movzbl %al, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %r11
-; AVX512DQ-NEXT:    shlq $5, %r11
-; AVX512DQ-NEXT:    orq %r14, %r11
-; AVX512DQ-NEXT:    movq %rax, %r14
-; AVX512DQ-NEXT:    shlq $6, %r14
-; AVX512DQ-NEXT:    shlq $7, %rax
-; AVX512DQ-NEXT:    orq %r14, %rax
-; AVX512DQ-NEXT:    movzbl %r13b, %r14d
-; AVX512DQ-NEXT:    andl $1, %r14d
-; AVX512DQ-NEXT:    movq %r14, %r13
-; AVX512DQ-NEXT:    shlq $8, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r14, %r15
-; AVX512DQ-NEXT:    shlq $9, %r15
-; AVX512DQ-NEXT:    orq %r13, %r15
-; AVX512DQ-NEXT:    kmovw %k1, %r13d
-; AVX512DQ-NEXT:    kshiftrw $2, %k0, %k1
-; AVX512DQ-NEXT:    shlq $10, %r14
-; AVX512DQ-NEXT:    orq %r15, %r14
-; AVX512DQ-NEXT:    movzbl %r12b, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %r15
-; AVX512DQ-NEXT:    shlq $11, %r15
-; AVX512DQ-NEXT:    orq %r14, %r15
-; AVX512DQ-NEXT:    movq %rax, %r12
-; AVX512DQ-NEXT:    shlq $12, %r12
-; AVX512DQ-NEXT:    orq %r15, %r12
-; AVX512DQ-NEXT:    kmovw %k1, %r14d
-; AVX512DQ-NEXT:    kshiftrw $3, %k0, %k1
-; AVX512DQ-NEXT:    shlq $13, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movzbl %bpl, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %r15
-; AVX512DQ-NEXT:    shlq $14, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $15, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    kmovw %k1, %r15d
-; AVX512DQ-NEXT:    kshiftrw $4, %k0, %k1
-; AVX512DQ-NEXT:    shlq $16, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movzbl %r9b, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %r9
-; AVX512DQ-NEXT:    shlq $17, %r9
-; AVX512DQ-NEXT:    orq %r12, %r9
-; AVX512DQ-NEXT:    movq %rax, %r12
-; AVX512DQ-NEXT:    shlq $18, %r12
-; AVX512DQ-NEXT:    orq %r9, %r12
-; AVX512DQ-NEXT:    kmovw %k1, %r9d
-; AVX512DQ-NEXT:    kshiftrw $5, %k0, %k1
-; AVX512DQ-NEXT:    shlq $19, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movzbl %r8b, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %r8
-; AVX512DQ-NEXT:    shlq $20, %r8
-; AVX512DQ-NEXT:    orq %rax, %r8
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $21, %rax
-; AVX512DQ-NEXT:    orq %r8, %rax
-; AVX512DQ-NEXT:    kmovw %k1, %r8d
-; AVX512DQ-NEXT:    kshiftrw $6, %k0, %k1
-; AVX512DQ-NEXT:    shlq $22, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movzbl %dil, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %rdi
-; AVX512DQ-NEXT:    shlq $23, %rdi
-; AVX512DQ-NEXT:    orq %r12, %rdi
-; AVX512DQ-NEXT:    movq %rax, %r12
-; AVX512DQ-NEXT:    shlq $24, %r12
-; AVX512DQ-NEXT:    orq %rdi, %r12
-; AVX512DQ-NEXT:    kmovw %k1, %edi
-; AVX512DQ-NEXT:    kshiftrw $8, %k0, %k1
-; AVX512DQ-NEXT:    shlq $25, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movzbl %dl, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rdx
-; AVX512DQ-NEXT:    shlq $26, %rdx
-; AVX512DQ-NEXT:    orq %rax, %rdx
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $27, %rax
-; AVX512DQ-NEXT:    orq %rdx, %rax
-; AVX512DQ-NEXT:    kmovw %k1, %edx
-; AVX512DQ-NEXT:    shlq $28, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movzbl %cl, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %rcx
-; AVX512DQ-NEXT:    shlq $29, %rcx
-; AVX512DQ-NEXT:    orq %r12, %rcx
-; AVX512DQ-NEXT:    movq %rax, %r12
-; AVX512DQ-NEXT:    shlq $30, %r12
-; AVX512DQ-NEXT:    orq %rcx, %r12
-; AVX512DQ-NEXT:    kmovw %k0, %ebp
-; AVX512DQ-NEXT:    kshiftrw $9, %k0, %k0
-; AVX512DQ-NEXT:    shlq $31, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    andl $1, %ebp
-; AVX512DQ-NEXT:    movq %rbp, %rcx
-; AVX512DQ-NEXT:    shlq $32, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    movq %rbp, %rax
-; AVX512DQ-NEXT:    shlq $33, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    kmovw %k0, %ecx
-; AVX512DQ-NEXT:    shlq $34, %rbp
-; AVX512DQ-NEXT:    orq %rax, %rbp
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $35, %rax
-; AVX512DQ-NEXT:    orq %rbp, %rax
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $36, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    shlq $37, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    andl $1, %r14d
-; AVX512DQ-NEXT:    movq %r14, %rax
-; AVX512DQ-NEXT:    shlq $38, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r14, %r12
-; AVX512DQ-NEXT:    shlq $39, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    shlq $40, %r14
-; AVX512DQ-NEXT:    orq %r12, %r14
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $41, %rax
-; AVX512DQ-NEXT:    orq %r14, %rax
-; AVX512DQ-NEXT:    movq %r15, %r14
-; AVX512DQ-NEXT:    shlq $42, %r14
-; AVX512DQ-NEXT:    orq %rax, %r14
-; AVX512DQ-NEXT:    shlq $43, %r15
-; AVX512DQ-NEXT:    orq %r14, %r15
-; AVX512DQ-NEXT:    andl $1, %r9d
-; AVX512DQ-NEXT:    movq %r9, %rax
-; AVX512DQ-NEXT:    shlq $44, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movq %r9, %r14
-; AVX512DQ-NEXT:    shlq $45, %r14
-; AVX512DQ-NEXT:    orq %rax, %r14
-; AVX512DQ-NEXT:    shlq $46, %r9
-; AVX512DQ-NEXT:    orq %r14, %r9
-; AVX512DQ-NEXT:    andl $1, %r8d
-; AVX512DQ-NEXT:    movq %r8, %rax
-; AVX512DQ-NEXT:    shlq $47, %rax
-; AVX512DQ-NEXT:    orq %r9, %rax
-; AVX512DQ-NEXT:    movq %r8, %r9
-; AVX512DQ-NEXT:    shlq $48, %r9
-; AVX512DQ-NEXT:    orq %rax, %r9
-; AVX512DQ-NEXT:    shlq $49, %r8
-; AVX512DQ-NEXT:    orq %r9, %r8
-; AVX512DQ-NEXT:    andl $1, %edi
-; AVX512DQ-NEXT:    movq %rdi, %rax
-; AVX512DQ-NEXT:    shlq $50, %rax
-; AVX512DQ-NEXT:    orq %r8, %rax
-; AVX512DQ-NEXT:    movq %rdi, %r8
-; AVX512DQ-NEXT:    shlq $51, %r8
-; AVX512DQ-NEXT:    orq %rax, %r8
-; AVX512DQ-NEXT:    shlq $52, %rdi
-; AVX512DQ-NEXT:    orq %r8, %rdi
-; AVX512DQ-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %r8
-; AVX512DQ-NEXT:    shlq $53, %r8
-; AVX512DQ-NEXT:    orq %rdi, %r8
-; AVX512DQ-NEXT:    movq %rax, %rdi
-; AVX512DQ-NEXT:    shlq $54, %rdi
-; AVX512DQ-NEXT:    orq %r8, %rdi
-; AVX512DQ-NEXT:    shlq $55, %rax
-; AVX512DQ-NEXT:    orq %rdi, %rax
-; AVX512DQ-NEXT:    andl $1, %edx
-; AVX512DQ-NEXT:    movq %rdx, %rdi
-; AVX512DQ-NEXT:    shlq $56, %rdi
-; AVX512DQ-NEXT:    orq %rax, %rdi
-; AVX512DQ-NEXT:    movq %rdx, %rax
-; AVX512DQ-NEXT:    shlq $57, %rax
-; AVX512DQ-NEXT:    orq %rdi, %rax
-; AVX512DQ-NEXT:    shlq $58, %rdx
-; AVX512DQ-NEXT:    orq %rax, %rdx
-; AVX512DQ-NEXT:    andl $1, %ecx
-; AVX512DQ-NEXT:    movq %rcx, %rax
-; AVX512DQ-NEXT:    shlq $59, %rax
-; AVX512DQ-NEXT:    orq %rdx, %rax
-; AVX512DQ-NEXT:    movq %rcx, %rdx
-; AVX512DQ-NEXT:    shlq $60, %rdx
-; AVX512DQ-NEXT:    orq %rax, %rdx
-; AVX512DQ-NEXT:    shlq $61, %rcx
-; AVX512DQ-NEXT:    orq %rdx, %rcx
-; AVX512DQ-NEXT:    andl $1, %r10d
-; AVX512DQ-NEXT:    shlq $62, %r10
-; AVX512DQ-NEXT:    orq %rcx, %r10
-; AVX512DQ-NEXT:    shlq $63, %rbx
-; AVX512DQ-NEXT:    orq %r10, %rbx
-; AVX512DQ-NEXT:    orq %r11, %rbx
-; AVX512DQ-NEXT:    movq %rbx, 8(%rsi)
-; AVX512DQ-NEXT:    popq %rbx
-; AVX512DQ-NEXT:    popq %r12
-; AVX512DQ-NEXT:    popq %r13
-; AVX512DQ-NEXT:    popq %r14
-; AVX512DQ-NEXT:    popq %r15
-; AVX512DQ-NEXT:    popq %rbp
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm4
+; AVX512DQ-NEXT:    kmovw 6(%rdi), %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm5
+; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k1
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm2, %zmm6
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm7, %zmm0
+; AVX512DQ-NEXT:    vpermd %zmm3, %zmm1, %zmm8
+; AVX512DQ-NEXT:    vpermd %zmm3, %zmm2, %zmm9
+; AVX512DQ-NEXT:    vpermd %zmm3, %zmm7, %zmm3
+; AVX512DQ-NEXT:    vpermd %zmm4, %zmm1, %zmm10
+; AVX512DQ-NEXT:    vpermd %zmm4, %zmm2, %zmm11
+; AVX512DQ-NEXT:    vpermd %zmm4, %zmm7, %zmm4
+; AVX512DQ-NEXT:    vpermd %zmm5, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpermd %zmm5, %zmm2, %zmm2
+; AVX512DQ-NEXT:    vpermd %zmm5, %zmm7, %zmm5
+; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm7 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm5, %k1
+; AVX512DQ-NEXT:    vmovdqa32 704(%rsi), %zmm5 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k1
+; AVX512DQ-NEXT:    vmovdqa32 640(%rsi), %zmm2 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k1
+; AVX512DQ-NEXT:    vmovdqa32 576(%rsi), %zmm1 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm4, %k1
+; AVX512DQ-NEXT:    vmovdqa32 512(%rsi), %zmm4 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm11, %k1
+; AVX512DQ-NEXT:    vmovdqa32 448(%rsi), %zmm11 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm10, %k1
+; AVX512DQ-NEXT:    vmovdqa32 384(%rsi), %zmm10 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm3, %k1
+; AVX512DQ-NEXT:    vmovdqa32 320(%rsi), %zmm3 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm9, %k1
+; AVX512DQ-NEXT:    vmovdqa32 256(%rsi), %zmm9 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm8, %k1
+; AVX512DQ-NEXT:    vmovdqa32 192(%rsi), %zmm8 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k1
+; AVX512DQ-NEXT:    vmovdqa32 128(%rsi), %zmm0 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm6, %k1
+; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm6 {%k1} {z}
+; AVX512DQ-NEXT:    vmovdqa64 %zmm6, 64(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 128(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm8, 192(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm9, 256(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 320(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm10, 384(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm11, 448(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 512(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 576(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 640(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm5, 704(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm7, (%rdx)
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512BW-LABEL: mask_replication_factor3_vf64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    pushq %rbp
-; AVX512BW-NEXT:    pushq %r15
-; AVX512BW-NEXT:    pushq %r14
-; AVX512BW-NEXT:    pushq %r13
-; AVX512BW-NEXT:    pushq %r12
-; AVX512BW-NEXT:    pushq %rbx
 ; AVX512BW-NEXT:    kmovq (%rdi), %k0
-; AVX512BW-NEXT:    kshiftrq $31, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512BW-NEXT:    kshiftrq $30, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %ecx
-; AVX512BW-NEXT:    kshiftrq $29, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %edx
-; AVX512BW-NEXT:    kshiftrq $28, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %edi
-; AVX512BW-NEXT:    kshiftrq $27, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r8d
-; AVX512BW-NEXT:    kshiftrq $26, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r9d
-; AVX512BW-NEXT:    kshiftrq $63, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r11d
-; AVX512BW-NEXT:    kshiftrq $45, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %ebp
-; AVX512BW-NEXT:    kshiftrq $44, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r15d
-; AVX512BW-NEXT:    kshiftrq $10, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r14d
-; AVX512BW-NEXT:    kshiftrq $2, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r10d
 ; AVX512BW-NEXT:    kshiftrq $1, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %ebx
-; AVX512BW-NEXT:    kshiftrq $3, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k0, %r12d
-; AVX512BW-NEXT:    movzbl %r12b, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    leaq (%r12,%r12,2), %r13
-; AVX512BW-NEXT:    leaq (%r13,%r12,4), %r12
-; AVX512BW-NEXT:    movzbl %bl, %ebx
-; AVX512BW-NEXT:    andl $1, %ebx
-; AVX512BW-NEXT:    leaq (%r12,%rbx,8), %r12
-; AVX512BW-NEXT:    movq %rbx, %r13
-; AVX512BW-NEXT:    shlq $4, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    shlq $5, %rbx
-; AVX512BW-NEXT:    orq %r13, %rbx
-; AVX512BW-NEXT:    movzbl %r10b, %r10d
-; AVX512BW-NEXT:    andl $1, %r10d
-; AVX512BW-NEXT:    movq %r10, %r12
-; AVX512BW-NEXT:    shlq $6, %r12
-; AVX512BW-NEXT:    movq %r10, %r13
-; AVX512BW-NEXT:    shlq $7, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrq $4, %k0, %k1
-; AVX512BW-NEXT:    shlq $8, %r10
-; AVX512BW-NEXT:    orq %r13, %r10
-; AVX512BW-NEXT:    movzbl %r12b, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $9, %r13
-; AVX512BW-NEXT:    orq %r10, %r13
-; AVX512BW-NEXT:    movq %r12, %r10
-; AVX512BW-NEXT:    shlq $10, %r10
-; AVX512BW-NEXT:    orq %r13, %r10
-; AVX512BW-NEXT:    kmovd %k1, %r13d
-; AVX512BW-NEXT:    kshiftrq $5, %k0, %k1
-; AVX512BW-NEXT:    shlq $11, %r12
-; AVX512BW-NEXT:    orq %r10, %r12
-; AVX512BW-NEXT:    movzbl %r13b, %r10d
-; AVX512BW-NEXT:    andl $1, %r10d
-; AVX512BW-NEXT:    movq %r10, %r13
-; AVX512BW-NEXT:    shlq $12, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    movq %r10, %r12
-; AVX512BW-NEXT:    shlq $13, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    kmovd %k1, %r13d
-; AVX512BW-NEXT:    kshiftrq $6, %k0, %k1
-; AVX512BW-NEXT:    shlq $14, %r10
-; AVX512BW-NEXT:    orq %r12, %r10
-; AVX512BW-NEXT:    movzbl %r13b, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $15, %r13
-; AVX512BW-NEXT:    orq %r10, %r13
-; AVX512BW-NEXT:    movq %r12, %r10
-; AVX512BW-NEXT:    shlq $16, %r10
-; AVX512BW-NEXT:    orq %r13, %r10
-; AVX512BW-NEXT:    kmovd %k1, %r13d
-; AVX512BW-NEXT:    kshiftrq $7, %k0, %k1
-; AVX512BW-NEXT:    shlq $17, %r12
-; AVX512BW-NEXT:    orq %r10, %r12
-; AVX512BW-NEXT:    movzbl %r13b, %r10d
-; AVX512BW-NEXT:    andl $1, %r10d
-; AVX512BW-NEXT:    movq %r10, %r13
-; AVX512BW-NEXT:    shlq $18, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    movq %r10, %r12
-; AVX512BW-NEXT:    shlq $19, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    kmovd %k1, %r13d
-; AVX512BW-NEXT:    kshiftrq $8, %k0, %k1
-; AVX512BW-NEXT:    shlq $20, %r10
-; AVX512BW-NEXT:    orq %r12, %r10
-; AVX512BW-NEXT:    movzbl %r13b, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $21, %r13
-; AVX512BW-NEXT:    orq %r10, %r13
-; AVX512BW-NEXT:    movq %r12, %r10
-; AVX512BW-NEXT:    shlq $22, %r10
-; AVX512BW-NEXT:    orq %r13, %r10
-; AVX512BW-NEXT:    kmovd %k1, %r13d
-; AVX512BW-NEXT:    kshiftrq $9, %k0, %k1
-; AVX512BW-NEXT:    shlq $23, %r12
-; AVX512BW-NEXT:    orq %r10, %r12
-; AVX512BW-NEXT:    movzbl %r13b, %r10d
-; AVX512BW-NEXT:    andl $1, %r10d
-; AVX512BW-NEXT:    movq %r10, %r13
-; AVX512BW-NEXT:    shlq $24, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    movq %r10, %r12
-; AVX512BW-NEXT:    shlq $25, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    kmovd %k1, %r13d
-; AVX512BW-NEXT:    kshiftrq $42, %k0, %k1
-; AVX512BW-NEXT:    shlq $26, %r10
-; AVX512BW-NEXT:    orq %r12, %r10
-; AVX512BW-NEXT:    movzbl %r13b, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $27, %r13
-; AVX512BW-NEXT:    orq %r10, %r13
-; AVX512BW-NEXT:    movq %r12, %r10
-; AVX512BW-NEXT:    shlq $28, %r10
-; AVX512BW-NEXT:    orq %r13, %r10
-; AVX512BW-NEXT:    shlq $29, %r12
-; AVX512BW-NEXT:    orq %r10, %r12
-; AVX512BW-NEXT:    movzbl %r14b, %r13d
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $30, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    kmovd %k1, %r10d
-; AVX512BW-NEXT:    kshiftrq $11, %k0, %k1
-; AVX512BW-NEXT:    shlq $31, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    kshiftrq $12, %k0, %k1
-; AVX512BW-NEXT:    andl $1, %r14d
-; AVX512BW-NEXT:    shlq $32, %r14
-; AVX512BW-NEXT:    orq %r13, %r14
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %r12
-; AVX512BW-NEXT:    shlq $33, %r12
-; AVX512BW-NEXT:    orq %r14, %r12
-; AVX512BW-NEXT:    movq %rax, %r14
-; AVX512BW-NEXT:    shlq $34, %r14
-; AVX512BW-NEXT:    orq %r12, %r14
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrq $13, %k0, %k1
-; AVX512BW-NEXT:    shlq $35, %rax
-; AVX512BW-NEXT:    orq %r14, %rax
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %r14
-; AVX512BW-NEXT:    shlq $36, %r14
-; AVX512BW-NEXT:    orq %rax, %r14
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $37, %rax
-; AVX512BW-NEXT:    orq %r14, %rax
-; AVX512BW-NEXT:    kmovd %k1, %r14d
-; AVX512BW-NEXT:    kshiftrq $14, %k0, %k1
-; AVX512BW-NEXT:    shlq $38, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    andl $1, %r14d
-; AVX512BW-NEXT:    movq %r14, %rax
-; AVX512BW-NEXT:    shlq $39, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r14, %r12
-; AVX512BW-NEXT:    shlq $40, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    kshiftrq $15, %k0, %k1
-; AVX512BW-NEXT:    shlq $41, %r14
-; AVX512BW-NEXT:    orq %r12, %r14
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %r12
-; AVX512BW-NEXT:    shlq $42, %r12
-; AVX512BW-NEXT:    orq %r14, %r12
-; AVX512BW-NEXT:    movq %rax, %r14
-; AVX512BW-NEXT:    shlq $43, %r14
-; AVX512BW-NEXT:    orq %r12, %r14
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrq $16, %k0, %k1
-; AVX512BW-NEXT:    shlq $44, %rax
-; AVX512BW-NEXT:    orq %r14, %rax
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %r14
-; AVX512BW-NEXT:    shlq $45, %r14
-; AVX512BW-NEXT:    orq %rax, %r14
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $46, %rax
-; AVX512BW-NEXT:    orq %r14, %rax
-; AVX512BW-NEXT:    kmovd %k1, %r14d
-; AVX512BW-NEXT:    kshiftrq $17, %k0, %k1
-; AVX512BW-NEXT:    shlq $47, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    andl $1, %r14d
-; AVX512BW-NEXT:    movq %r14, %rax
-; AVX512BW-NEXT:    shlq $48, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r14, %r12
-; AVX512BW-NEXT:    shlq $49, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    kshiftrq $18, %k0, %k1
-; AVX512BW-NEXT:    shlq $50, %r14
-; AVX512BW-NEXT:    orq %r12, %r14
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %r12
-; AVX512BW-NEXT:    shlq $51, %r12
-; AVX512BW-NEXT:    orq %r14, %r12
-; AVX512BW-NEXT:    movq %rax, %r14
-; AVX512BW-NEXT:    shlq $52, %r14
-; AVX512BW-NEXT:    orq %r12, %r14
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrq $19, %k0, %k1
-; AVX512BW-NEXT:    shlq $53, %rax
-; AVX512BW-NEXT:    orq %r14, %rax
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %r14
-; AVX512BW-NEXT:    shlq $54, %r14
-; AVX512BW-NEXT:    orq %rax, %r14
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $55, %rax
-; AVX512BW-NEXT:    orq %r14, %rax
-; AVX512BW-NEXT:    kmovd %k1, %r14d
-; AVX512BW-NEXT:    kshiftrq $20, %k0, %k1
-; AVX512BW-NEXT:    shlq $56, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    andl $1, %r14d
-; AVX512BW-NEXT:    movq %r14, %rax
-; AVX512BW-NEXT:    shlq $57, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r14, %r12
-; AVX512BW-NEXT:    shlq $58, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    kshiftrq $21, %k0, %k1
-; AVX512BW-NEXT:    shlq $59, %r14
-; AVX512BW-NEXT:    orq %r12, %r14
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %r12
-; AVX512BW-NEXT:    shlq $60, %r12
-; AVX512BW-NEXT:    orq %r14, %r12
-; AVX512BW-NEXT:    movq %rax, %r14
-; AVX512BW-NEXT:    shlq $61, %r14
-; AVX512BW-NEXT:    orq %r12, %r14
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrq $43, %k0, %k1
-; AVX512BW-NEXT:    shlq $62, %rax
-; AVX512BW-NEXT:    orq %r14, %rax
-; AVX512BW-NEXT:    movzbl %r12b, %r14d
-; AVX512BW-NEXT:    movq %r14, %r12
-; AVX512BW-NEXT:    shlq $63, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    kshiftrq $46, %k0, %k1
-; AVX512BW-NEXT:    orq %rbx, %r12
-; AVX512BW-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512BW-NEXT:    movq %r12, (%rsi)
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movzbl %r10b, %ebx
-; AVX512BW-NEXT:    movl %ebx, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    leaq (%r12,%rax,2), %r12
-; AVX512BW-NEXT:    leaq (%r12,%rax,4), %r12
-; AVX512BW-NEXT:    leaq (%r12,%rax,8), %rax
-; AVX512BW-NEXT:    movzbl %r15b, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $4, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r12, %r15
-; AVX512BW-NEXT:    shlq $5, %r15
-; AVX512BW-NEXT:    orq %r13, %r15
-; AVX512BW-NEXT:    shlq $6, %r12
-; AVX512BW-NEXT:    movzbl %bpl, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %r13
-; AVX512BW-NEXT:    shlq $7, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    movq %rax, %r12
-; AVX512BW-NEXT:    shlq $8, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    kmovd %k1, %ebp
-; AVX512BW-NEXT:    kshiftrq $47, %k0, %k1
-; AVX512BW-NEXT:    shlq $9, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movzbl %bpl, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $10, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $11, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    kmovd %k1, %ebp
-; AVX512BW-NEXT:    kshiftrq $48, %k0, %k1
-; AVX512BW-NEXT:    shlq $12, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movzbl %bpl, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %r13
-; AVX512BW-NEXT:    shlq $13, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    movq %rax, %r12
-; AVX512BW-NEXT:    shlq $14, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    kmovd %k1, %ebp
-; AVX512BW-NEXT:    kshiftrq $49, %k0, %k1
-; AVX512BW-NEXT:    shlq $15, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movzbl %bpl, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $16, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $17, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    kmovd %k1, %ebp
-; AVX512BW-NEXT:    kshiftrq $50, %k0, %k1
-; AVX512BW-NEXT:    shlq $18, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movzbl %bpl, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %r13
-; AVX512BW-NEXT:    shlq $19, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    movq %rax, %r12
-; AVX512BW-NEXT:    shlq $20, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    kmovd %k1, %ebp
-; AVX512BW-NEXT:    kshiftrq $51, %k0, %k1
-; AVX512BW-NEXT:    shlq $21, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movzbl %bpl, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $22, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $23, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    kmovd %k1, %ebp
-; AVX512BW-NEXT:    kshiftrq $52, %k0, %k1
-; AVX512BW-NEXT:    shlq $24, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movzbl %bpl, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %r13
-; AVX512BW-NEXT:    shlq $25, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    movq %rax, %r12
-; AVX512BW-NEXT:    shlq $26, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    kmovd %k1, %ebp
-; AVX512BW-NEXT:    kshiftrq $53, %k0, %k1
-; AVX512BW-NEXT:    shlq $27, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movzbl %bpl, %r13d
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $28, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $29, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrq $54, %k0, %k1
-; AVX512BW-NEXT:    shlq $30, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movzbl %r12b, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    shlq $31, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $32, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    kshiftrq $55, %k0, %k1
-; AVX512BW-NEXT:    shlq $33, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %r13
-; AVX512BW-NEXT:    shlq $34, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    movq %rax, %r12
-; AVX512BW-NEXT:    shlq $35, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    kmovd %k1, %r13d
-; AVX512BW-NEXT:    kshiftrq $56, %k0, %k1
-; AVX512BW-NEXT:    shlq $36, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $37, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $38, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrq $57, %k0, %k1
-; AVX512BW-NEXT:    shlq $39, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $40, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $41, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    kshiftrq $58, %k0, %k1
-; AVX512BW-NEXT:    shlq $42, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %r13
-; AVX512BW-NEXT:    shlq $43, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    movq %rax, %r12
-; AVX512BW-NEXT:    shlq $44, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    kmovd %k1, %r13d
+; AVX512BW-NEXT:    movw $-3, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k3
+; AVX512BW-NEXT:    kandw %k3, %k0, %k2
+; AVX512BW-NEXT:    kmovq %k3, %k7
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k3
+; AVX512BW-NEXT:    kshiftrw $14, %k3, %k4
+; AVX512BW-NEXT:    korw %k4, %k2, %k2
+; AVX512BW-NEXT:    movw $-5, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k4
+; AVX512BW-NEXT:    kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k4, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $13, %k3, %k3
+; AVX512BW-NEXT:    korw %k3, %k2, %k2
+; AVX512BW-NEXT:    movw $-9, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k3
+; AVX512BW-NEXT:    kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k3, %k2, %k2
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $12, %k1, %k3
+; AVX512BW-NEXT:    korw %k3, %k2, %k2
+; AVX512BW-NEXT:    movw $-17, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k3
+; AVX512BW-NEXT:    kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k3, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $11, %k1, %k3
+; AVX512BW-NEXT:    korw %k3, %k2, %k2
+; AVX512BW-NEXT:    movw $-33, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k3
+; AVX512BW-NEXT:    kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k3, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $10, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k2, %k1
+; AVX512BW-NEXT:    movw $-65, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k2
+; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrq $2, %k0, %k2
+; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $9, %k2, %k3
+; AVX512BW-NEXT:    korw %k3, %k1, %k1
+; AVX512BW-NEXT:    movw $-129, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k3
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kmovq %k3, %k5
+; AVX512BW-NEXT:    kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kshiftrw $8, %k2, %k3
+; AVX512BW-NEXT:    korw %k3, %k1, %k1
+; AVX512BW-NEXT:    movw $-257, %ax # imm = 0xFEFF
+; AVX512BW-NEXT:    kmovd %eax, %k3
+; AVX512BW-NEXT:    kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $7, %k2, %k2
+; AVX512BW-NEXT:    korw %k2, %k1, %k1
+; AVX512BW-NEXT:    movw $-513, %ax # imm = 0xFDFF
+; AVX512BW-NEXT:    kmovd %eax, %k2
+; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrq $3, %k0, %k2
+; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $6, %k2, %k3
+; AVX512BW-NEXT:    korw %k3, %k1, %k1
+; AVX512BW-NEXT:    movw $-1025, %ax # imm = 0xFBFF
+; AVX512BW-NEXT:    kmovd %eax, %k3
+; AVX512BW-NEXT:    kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $5, %k2, %k3
+; AVX512BW-NEXT:    korw %k3, %k1, %k1
+; AVX512BW-NEXT:    movw $-2049, %ax # imm = 0xF7FF
+; AVX512BW-NEXT:    kmovd %eax, %k3
+; AVX512BW-NEXT:    kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $4, %k2, %k2
+; AVX512BW-NEXT:    korw %k2, %k1, %k1
+; AVX512BW-NEXT:    movw $-4097, %ax # imm = 0xEFFF
+; AVX512BW-NEXT:    kmovd %eax, %k2
+; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrq $4, %k0, %k2
+; AVX512BW-NEXT:    kshiftlw $15, %k2, %k3
+; AVX512BW-NEXT:    kshiftrw $3, %k3, %k4
+; AVX512BW-NEXT:    korw %k4, %k1, %k1
+; AVX512BW-NEXT:    movw $-8193, %ax # imm = 0xDFFF
+; AVX512BW-NEXT:    kmovd %eax, %k6
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kshiftrw $2, %k3, %k3
+; AVX512BW-NEXT:    korw %k3, %k1, %k1
+; AVX512BW-NEXT:    movw $-16385, %ax # imm = 0xBFFF
+; AVX512BW-NEXT:    kmovd %eax, %k3
+; AVX512BW-NEXT:    kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $14, %k2, %k2
+; AVX512BW-NEXT:    korw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
+; AVX512BW-NEXT:    kshiftrq $5, %k0, %k2
+; AVX512BW-NEXT:    kmovq %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
+; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    korw %k2, %k1, %k1
+; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
 ; AVX512BW-NEXT:    kshiftrq $59, %k0, %k1
-; AVX512BW-NEXT:    shlq $45, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $46, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $47, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrq $60, %k0, %k1
-; AVX512BW-NEXT:    shlq $48, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $49, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $50, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    kshiftrq $61, %k0, %k1
-; AVX512BW-NEXT:    shlq $51, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %r13
-; AVX512BW-NEXT:    shlq $52, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    movq %rax, %r12
-; AVX512BW-NEXT:    shlq $53, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    kmovd %k1, %r13d
-; AVX512BW-NEXT:    kshiftrq $62, %k0, %k1
-; AVX512BW-NEXT:    shlq $54, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $55, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $56, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrq $25, %k0, %k1
-; AVX512BW-NEXT:    shlq $57, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $58, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $59, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    shlq $60, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    movl %r11d, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %r13
-; AVX512BW-NEXT:    shlq $61, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    kmovd %k1, %ebp
-; AVX512BW-NEXT:    kshiftrq $24, %k0, %k1
-; AVX512BW-NEXT:    shlq $62, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrq $23, %k0, %k1
-; AVX512BW-NEXT:    movzbl %r11b, %r13d
-; AVX512BW-NEXT:    shlq $63, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    kmovd %k1, %r11d
-; AVX512BW-NEXT:    kshiftrq $22, %k0, %k1
-; AVX512BW-NEXT:    orq %r15, %r13
-; AVX512BW-NEXT:    kmovd %k1, %r15d
-; AVX512BW-NEXT:    kshiftrq $32, %k0, %k1
-; AVX512BW-NEXT:    movq %r13, 16(%rsi)
-; AVX512BW-NEXT:    kmovd %k1, %esi
-; AVX512BW-NEXT:    kshiftrq $33, %k0, %k1
-; AVX512BW-NEXT:    movzbl %r15b, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    andl $1, %r14d
-; AVX512BW-NEXT:    leaq (%r14,%r14,2), %r14
-; AVX512BW-NEXT:    leaq (%r14,%rax,4), %r14
-; AVX512BW-NEXT:    leaq (%r14,%rax,8), %r14
-; AVX512BW-NEXT:    shlq $4, %rax
-; AVX512BW-NEXT:    orq %r14, %rax
-; AVX512BW-NEXT:    movzbl %r11b, %r14d
-; AVX512BW-NEXT:    andl $1, %r14d
-; AVX512BW-NEXT:    movq %r14, %r11
-; AVX512BW-NEXT:    shlq $5, %r11
-; AVX512BW-NEXT:    orq %rax, %r11
-; AVX512BW-NEXT:    movq %r14, %rax
-; AVX512BW-NEXT:    shlq $6, %rax
-; AVX512BW-NEXT:    shlq $7, %r14
-; AVX512BW-NEXT:    orq %rax, %r14
-; AVX512BW-NEXT:    movzbl %r12b, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %r15
-; AVX512BW-NEXT:    shlq $8, %r15
-; AVX512BW-NEXT:    orq %r14, %r15
-; AVX512BW-NEXT:    movq %rax, %r12
-; AVX512BW-NEXT:    shlq $9, %r12
-; AVX512BW-NEXT:    orq %r15, %r12
-; AVX512BW-NEXT:    kmovd %k1, %r14d
-; AVX512BW-NEXT:    kshiftrq $34, %k0, %k1
-; AVX512BW-NEXT:    shlq $10, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movzbl %bpl, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %r15
-; AVX512BW-NEXT:    shlq $11, %r15
-; AVX512BW-NEXT:    orq %rax, %r15
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $12, %rax
-; AVX512BW-NEXT:    orq %r15, %rax
-; AVX512BW-NEXT:    kmovd %k1, %r15d
-; AVX512BW-NEXT:    kshiftrq $35, %k0, %k1
-; AVX512BW-NEXT:    shlq $13, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movzbl %r9b, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %r9
-; AVX512BW-NEXT:    shlq $14, %r9
-; AVX512BW-NEXT:    orq %r12, %r9
-; AVX512BW-NEXT:    movq %rax, %r12
-; AVX512BW-NEXT:    shlq $15, %r12
-; AVX512BW-NEXT:    orq %r9, %r12
-; AVX512BW-NEXT:    kmovd %k1, %r9d
-; AVX512BW-NEXT:    kshiftrq $36, %k0, %k1
-; AVX512BW-NEXT:    shlq $16, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movzbl %r8b, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %r8
-; AVX512BW-NEXT:    shlq $17, %r8
-; AVX512BW-NEXT:    orq %rax, %r8
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $18, %rax
-; AVX512BW-NEXT:    orq %r8, %rax
-; AVX512BW-NEXT:    kmovd %k1, %r8d
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k2
+; AVX512BW-NEXT:    kshiftrq $58, %k0, %k1
+; AVX512BW-NEXT:    kmovq %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512BW-NEXT:    kmovq %k7, %k3
+; AVX512BW-NEXT:    kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k7, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $14, %k2, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $13, %k2, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $12, %k2, %k2
+; AVX512BW-NEXT:    korw %k2, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrq $60, %k0, %k2
+; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $11, %k2, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $10, %k2, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $9, %k2, %k2
+; AVX512BW-NEXT:    korw %k2, %k1, %k1
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrq $61, %k0, %k2
+; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $8, %k2, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $7, %k2, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $6, %k2, %k2
+; AVX512BW-NEXT:    korw %k2, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrq $62, %k0, %k2
+; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $5, %k2, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $4, %k2, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $3, %k2, %k2
+; AVX512BW-NEXT:    korw %k2, %k1, %k1
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrq $63, %k0, %k2
+; AVX512BW-NEXT:    kshiftlw $15, %k2, %k7
+; AVX512BW-NEXT:    kshiftrw $2, %k7, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $14, %k2, %k2
+; AVX512BW-NEXT:    korw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    vmovdqa32 704(%rsi), %zmm1 {%k1} {z}
+; AVX512BW-NEXT:    kshiftrq $53, %k0, %k1
+; AVX512BW-NEXT:    kandw %k3, %k1, %k6
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kshiftrw $14, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k6, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrq $54, %k0, %k6
+; AVX512BW-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $13, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $12, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $11, %k6, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrq $55, %k0, %k6
+; AVX512BW-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $10, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $9, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $8, %k6, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrq $56, %k0, %k6
+; AVX512BW-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $7, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $6, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $5, %k6, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrq $57, %k0, %k6
+; AVX512BW-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $4, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $2, %k6, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 8-byte Reload
+; AVX512BW-NEXT:    kshiftlw $14, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k3, %k3
+; AVX512BW-NEXT:    korw %k3, %k1, %k1
+; AVX512BW-NEXT:    vmovdqa32 640(%rsi), %zmm2 {%k1} {z}
+; AVX512BW-NEXT:    kshiftrq $48, %k0, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k3
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $14, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $13, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k3, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrq $49, %k0, %k3
+; AVX512BW-NEXT:    kshiftlw $15, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $12, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $11, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $10, %k3, %k3
+; AVX512BW-NEXT:    korw %k3, %k1, %k1
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrq $50, %k0, %k3
+; AVX512BW-NEXT:    kshiftlw $15, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $9, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $8, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $7, %k3, %k3
+; AVX512BW-NEXT:    korw %k3, %k1, %k1
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrq $51, %k0, %k3
+; AVX512BW-NEXT:    kshiftlw $15, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $6, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $5, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $4, %k3, %k3
+; AVX512BW-NEXT:    korw %k3, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrq $52, %k0, %k3
+; AVX512BW-NEXT:    kshiftlw $15, %k3, %k6
+; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $2, %k6, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $14, %k3, %k3
+; AVX512BW-NEXT:    korw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    korw %k2, %k1, %k1
+; AVX512BW-NEXT:    vmovdqa32 576(%rsi), %zmm3 {%k1} {z}
+; AVX512BW-NEXT:    kshiftrq $43, %k0, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrq $42, %k0, %k3
+; AVX512BW-NEXT:    kmovq %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $14, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $13, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $12, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k3, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrq $44, %k0, %k3
+; AVX512BW-NEXT:    kshiftlw $15, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $11, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $10, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $9, %k3, %k3
+; AVX512BW-NEXT:    korw %k3, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrq $45, %k0, %k3
+; AVX512BW-NEXT:    kshiftlw $15, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $8, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $7, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $6, %k3, %k3
+; AVX512BW-NEXT:    korw %k3, %k1, %k1
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrq $46, %k0, %k3
+; AVX512BW-NEXT:    kshiftlw $15, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $5, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $4, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $3, %k3, %k3
+; AVX512BW-NEXT:    korw %k3, %k1, %k1
+; AVX512BW-NEXT:    kandw %k7, %k1, %k1
+; AVX512BW-NEXT:    kshiftrq $47, %k0, %k3
+; AVX512BW-NEXT:    kshiftlw $15, %k3, %k6
+; AVX512BW-NEXT:    kshiftrw $2, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $14, %k3, %k3
+; AVX512BW-NEXT:    korw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    vmovdqa32 512(%rsi), %zmm4 {%k1} {z}
 ; AVX512BW-NEXT:    kshiftrq $37, %k0, %k1
-; AVX512BW-NEXT:    shlq $19, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movzbl %dil, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %rdi
-; AVX512BW-NEXT:    shlq $20, %rdi
-; AVX512BW-NEXT:    orq %r12, %rdi
-; AVX512BW-NEXT:    movq %rax, %r12
-; AVX512BW-NEXT:    shlq $21, %r12
-; AVX512BW-NEXT:    orq %rdi, %r12
-; AVX512BW-NEXT:    kmovd %k1, %edi
-; AVX512BW-NEXT:    kshiftrq $38, %k0, %k1
-; AVX512BW-NEXT:    shlq $22, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movzbl %dl, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rdx
-; AVX512BW-NEXT:    shlq $23, %rdx
-; AVX512BW-NEXT:    orq %rax, %rdx
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $24, %rax
-; AVX512BW-NEXT:    orq %rdx, %rax
-; AVX512BW-NEXT:    kmovd %k1, %edx
-; AVX512BW-NEXT:    kshiftrq $39, %k0, %k1
-; AVX512BW-NEXT:    shlq $25, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movzbl %cl, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %rcx
-; AVX512BW-NEXT:    shlq $26, %rcx
-; AVX512BW-NEXT:    orq %r12, %rcx
-; AVX512BW-NEXT:    movq %rax, %r12
-; AVX512BW-NEXT:    shlq $27, %r12
-; AVX512BW-NEXT:    orq %rcx, %r12
-; AVX512BW-NEXT:    kmovd %k1, %ecx
-; AVX512BW-NEXT:    kshiftrq $40, %k0, %k1
-; AVX512BW-NEXT:    shlq $28, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rbp
-; AVX512BW-NEXT:    shlq $29, %rbp
-; AVX512BW-NEXT:    orq %rax, %rbp
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $30, %rax
-; AVX512BW-NEXT:    orq %rbp, %rax
-; AVX512BW-NEXT:    kmovd %k1, %ebp
-; AVX512BW-NEXT:    kshiftrq $41, %k0, %k0
-; AVX512BW-NEXT:    shlq $31, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    andl $1, %esi
-; AVX512BW-NEXT:    movq %rsi, %rax
-; AVX512BW-NEXT:    shlq $32, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %rsi, %r13
-; AVX512BW-NEXT:    shlq $33, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    kmovd %k0, %r12d
-; AVX512BW-NEXT:    shlq $34, %rsi
-; AVX512BW-NEXT:    orq %r13, %rsi
-; AVX512BW-NEXT:    andl $1, %r14d
-; AVX512BW-NEXT:    movq %r14, %rax
-; AVX512BW-NEXT:    shlq $35, %rax
-; AVX512BW-NEXT:    orq %rsi, %rax
-; AVX512BW-NEXT:    movq %r14, %rsi
-; AVX512BW-NEXT:    shlq $36, %rsi
-; AVX512BW-NEXT:    orq %rax, %rsi
-; AVX512BW-NEXT:    shlq $37, %r14
-; AVX512BW-NEXT:    orq %rsi, %r14
-; AVX512BW-NEXT:    andl $1, %r15d
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $38, %rax
-; AVX512BW-NEXT:    orq %r14, %rax
-; AVX512BW-NEXT:    movq %r15, %rsi
-; AVX512BW-NEXT:    shlq $39, %rsi
-; AVX512BW-NEXT:    orq %rax, %rsi
-; AVX512BW-NEXT:    shlq $40, %r15
-; AVX512BW-NEXT:    orq %rsi, %r15
-; AVX512BW-NEXT:    andl $1, %r9d
-; AVX512BW-NEXT:    movq %r9, %rax
-; AVX512BW-NEXT:    shlq $41, %rax
-; AVX512BW-NEXT:    orq %r15, %rax
-; AVX512BW-NEXT:    movq %r9, %rsi
-; AVX512BW-NEXT:    shlq $42, %rsi
-; AVX512BW-NEXT:    orq %rax, %rsi
-; AVX512BW-NEXT:    shlq $43, %r9
-; AVX512BW-NEXT:    orq %rsi, %r9
-; AVX512BW-NEXT:    andl $1, %r8d
-; AVX512BW-NEXT:    movq %r8, %rax
-; AVX512BW-NEXT:    shlq $44, %rax
-; AVX512BW-NEXT:    orq %r9, %rax
-; AVX512BW-NEXT:    movq %r8, %rsi
-; AVX512BW-NEXT:    shlq $45, %rsi
-; AVX512BW-NEXT:    orq %rax, %rsi
-; AVX512BW-NEXT:    shlq $46, %r8
-; AVX512BW-NEXT:    orq %rsi, %r8
-; AVX512BW-NEXT:    andl $1, %edi
-; AVX512BW-NEXT:    movq %rdi, %rax
-; AVX512BW-NEXT:    shlq $47, %rax
-; AVX512BW-NEXT:    orq %r8, %rax
-; AVX512BW-NEXT:    movq %rdi, %rsi
-; AVX512BW-NEXT:    shlq $48, %rsi
-; AVX512BW-NEXT:    orq %rax, %rsi
-; AVX512BW-NEXT:    shlq $49, %rdi
-; AVX512BW-NEXT:    orq %rsi, %rdi
-; AVX512BW-NEXT:    andl $1, %edx
-; AVX512BW-NEXT:    movq %rdx, %rax
-; AVX512BW-NEXT:    shlq $50, %rax
-; AVX512BW-NEXT:    orq %rdi, %rax
-; AVX512BW-NEXT:    movq %rdx, %rsi
-; AVX512BW-NEXT:    shlq $51, %rsi
-; AVX512BW-NEXT:    orq %rax, %rsi
-; AVX512BW-NEXT:    shlq $52, %rdx
-; AVX512BW-NEXT:    orq %rsi, %rdx
-; AVX512BW-NEXT:    andl $1, %ecx
-; AVX512BW-NEXT:    movq %rcx, %rax
-; AVX512BW-NEXT:    shlq $53, %rax
-; AVX512BW-NEXT:    orq %rdx, %rax
-; AVX512BW-NEXT:    movq %rcx, %rdx
-; AVX512BW-NEXT:    shlq $54, %rdx
-; AVX512BW-NEXT:    orq %rax, %rdx
-; AVX512BW-NEXT:    shlq $55, %rcx
-; AVX512BW-NEXT:    orq %rdx, %rcx
-; AVX512BW-NEXT:    andl $1, %ebp
-; AVX512BW-NEXT:    movq %rbp, %rax
-; AVX512BW-NEXT:    shlq $56, %rax
-; AVX512BW-NEXT:    orq %rcx, %rax
-; AVX512BW-NEXT:    movq %rbp, %rcx
-; AVX512BW-NEXT:    shlq $57, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    shlq $58, %rbp
-; AVX512BW-NEXT:    orq %rcx, %rbp
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $59, %rax
-; AVX512BW-NEXT:    orq %rbp, %rax
-; AVX512BW-NEXT:    movq %r12, %rcx
-; AVX512BW-NEXT:    shlq $60, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    shlq $61, %r12
-; AVX512BW-NEXT:    orq %rcx, %r12
-; AVX512BW-NEXT:    andl $1, %r10d
-; AVX512BW-NEXT:    shlq $62, %r10
-; AVX512BW-NEXT:    orq %r12, %r10
-; AVX512BW-NEXT:    shlq $63, %rbx
-; AVX512BW-NEXT:    orq %r10, %rbx
-; AVX512BW-NEXT:    orq %r11, %rbx
-; AVX512BW-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512BW-NEXT:    movq %rbx, 8(%rax)
-; AVX512BW-NEXT:    popq %rbx
-; AVX512BW-NEXT:    popq %r12
-; AVX512BW-NEXT:    popq %r13
-; AVX512BW-NEXT:    popq %r14
-; AVX512BW-NEXT:    popq %r15
-; AVX512BW-NEXT:    popq %rbp
-; AVX512BW-NEXT:    retq
-  %src.vec = load <64 x i1>, ptr %in.vec, align 64
-  %tgt.mask = shufflevector <64 x i1> %src.vec, <64 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
-  store <192 x i1> %tgt.mask, ptr %out.vec, align 64
-  ret void
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k3
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kshiftrw $14, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k3, %k3
+; AVX512BW-NEXT:    kshiftrq $38, %k0, %k6
+; AVX512BW-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $13, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $12, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $11, %k6, %k6
+; AVX512BW-NEXT:    korw %k6, %k3, %k3
+; AVX512BW-NEXT:    kandw %k2, %k3, %k3
+; AVX512BW-NEXT:    kshiftrq $39, %k0, %k6
+; AVX512BW-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $10, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $9, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $8, %k6, %k6
+; AVX512BW-NEXT:    korw %k6, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k3, %k3
+; AVX512BW-NEXT:    kshiftrq $40, %k0, %k6
+; AVX512BW-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $7, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $6, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $5, %k6, %k6
+; AVX512BW-NEXT:    korw %k6, %k3, %k3
+; AVX512BW-NEXT:    kandw %k5, %k3, %k3
+; AVX512BW-NEXT:    kshiftrq $41, %k0, %k6
+; AVX512BW-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $4, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $2, %k6, %k6
+; AVX512BW-NEXT:    korw %k6, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k3, %k3
+; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 8-byte Reload
+; AVX512BW-NEXT:    kshiftlw $14, %k2, %k6
+; AVX512BW-NEXT:    korw %k6, %k3, %k3
+; AVX512BW-NEXT:    kshiftlw $1, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $1, %k3, %k3
+; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
+; AVX512BW-NEXT:    korw %k2, %k3, %k2
+; AVX512BW-NEXT:    vmovdqa32 448(%rsi), %zmm5 {%k2} {z}
+; AVX512BW-NEXT:    kshiftrq $32, %k0, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k2, %k3
+; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $14, %k2, %k6
+; AVX512BW-NEXT:    korw %k6, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $13, %k2, %k2
+; AVX512BW-NEXT:    korw %k2, %k3, %k2
+; AVX512BW-NEXT:    kandw %k4, %k2, %k2
+; AVX512BW-NEXT:    kshiftrq $33, %k0, %k3
+; AVX512BW-NEXT:    kshiftlw $15, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $12, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $11, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $10, %k3, %k3
+; AVX512BW-NEXT:    korw %k3, %k2, %k2
+; AVX512BW-NEXT:    kandw %k1, %k2, %k2
+; AVX512BW-NEXT:    kshiftrq $34, %k0, %k3
+; AVX512BW-NEXT:    kshiftlw $15, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $9, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $8, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $7, %k3, %k3
+; AVX512BW-NEXT:    korw %k3, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k2, %k2
+; AVX512BW-NEXT:    kshiftrq $35, %k0, %k3
+; AVX512BW-NEXT:    kshiftlw $15, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $6, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $5, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $4, %k3, %k3
+; AVX512BW-NEXT:    korw %k3, %k2, %k2
+; AVX512BW-NEXT:    kandw %k5, %k2, %k2
+; AVX512BW-NEXT:    kshiftrq $36, %k0, %k3
+; AVX512BW-NEXT:    kshiftlw $15, %k3, %k6
+; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $2, %k6, %k6
+; AVX512BW-NEXT:    korw %k6, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k2, %k2
+; AVX512BW-NEXT:    kshiftlw $14, %k3, %k3
+; AVX512BW-NEXT:    korw %k3, %k2, %k2
+; AVX512BW-NEXT:    kshiftlw $1, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $1, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    korw %k1, %k2, %k1
+; AVX512BW-NEXT:    vmovdqa32 384(%rsi), %zmm6 {%k1} {z}
+; AVX512BW-NEXT:    kshiftrq $27, %k0, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrq $26, %k0, %k3
+; AVX512BW-NEXT:    kmovq %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $14, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $13, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $12, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k3, %k1
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrq $28, %k0, %k3
+; AVX512BW-NEXT:    kshiftlw $15, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $11, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $10, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $9, %k3, %k3
+; AVX512BW-NEXT:    korw %k3, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrq $29, %k0, %k3
+; AVX512BW-NEXT:    kshiftlw $15, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $8, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $7, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $6, %k3, %k3
+; AVX512BW-NEXT:    korw %k3, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrq $30, %k0, %k3
+; AVX512BW-NEXT:    kshiftlw $15, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $5, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $4, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $3, %k3, %k3
+; AVX512BW-NEXT:    korw %k3, %k1, %k1
+; AVX512BW-NEXT:    kandw %k7, %k1, %k1
+; AVX512BW-NEXT:    kshiftrq $31, %k0, %k3
+; AVX512BW-NEXT:    kshiftlw $15, %k3, %k6
+; AVX512BW-NEXT:    kshiftrw $2, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $14, %k3, %k3
+; AVX512BW-NEXT:    korw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    vmovdqa32 320(%rsi), %zmm7 {%k1} {z}
+; AVX512BW-NEXT:    kshiftrq $21, %k0, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k3
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kshiftrw $14, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k3, %k3
+; AVX512BW-NEXT:    kshiftrq $22, %k0, %k6
+; AVX512BW-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $13, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $12, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $11, %k6, %k6
+; AVX512BW-NEXT:    korw %k6, %k3, %k3
+; AVX512BW-NEXT:    kandw %k4, %k3, %k3
+; AVX512BW-NEXT:    kshiftrq $23, %k0, %k6
+; AVX512BW-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $10, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $9, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $8, %k6, %k6
+; AVX512BW-NEXT:    korw %k6, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k3, %k3
+; AVX512BW-NEXT:    kshiftrq $24, %k0, %k6
+; AVX512BW-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $7, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $6, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k3, %k3
+; AVX512BW-NEXT:    kandw %k2, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $5, %k6, %k6
+; AVX512BW-NEXT:    korw %k6, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k3, %k3
+; AVX512BW-NEXT:    kshiftrq $25, %k0, %k6
+; AVX512BW-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $4, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $2, %k6, %k6
+; AVX512BW-NEXT:    korw %k6, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k3, %k3
+; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 8-byte Reload
+; AVX512BW-NEXT:    kshiftlw $14, %k2, %k6
+; AVX512BW-NEXT:    korw %k6, %k3, %k3
+; AVX512BW-NEXT:    kshiftlw $1, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $1, %k3, %k3
+; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
+; AVX512BW-NEXT:    korw %k2, %k3, %k2
+; AVX512BW-NEXT:    vmovdqa32 256(%rsi), %zmm8 {%k2} {z}
+; AVX512BW-NEXT:    kshiftrq $16, %k0, %k2
+; AVX512BW-NEXT:    kandw %k5, %k2, %k3
+; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $14, %k2, %k6
+; AVX512BW-NEXT:    korw %k6, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $13, %k2, %k2
+; AVX512BW-NEXT:    korw %k2, %k3, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k2, %k2
+; AVX512BW-NEXT:    kshiftrq $17, %k0, %k3
+; AVX512BW-NEXT:    kshiftlw $15, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $12, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $11, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $10, %k3, %k3
+; AVX512BW-NEXT:    korw %k3, %k2, %k2
+; AVX512BW-NEXT:    kandw %k4, %k2, %k2
+; AVX512BW-NEXT:    kshiftrq $18, %k0, %k3
+; AVX512BW-NEXT:    kshiftlw $15, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $9, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $8, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k2, %k2
+; AVX512BW-NEXT:    kandw %k1, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $7, %k3, %k3
+; AVX512BW-NEXT:    korw %k3, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k2, %k2
+; AVX512BW-NEXT:    kshiftrq $19, %k0, %k3
+; AVX512BW-NEXT:    kshiftlw $15, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $6, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $5, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $4, %k3, %k3
+; AVX512BW-NEXT:    korw %k3, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k2, %k2
+; AVX512BW-NEXT:    kshiftrq $20, %k0, %k3
+; AVX512BW-NEXT:    kshiftlw $15, %k3, %k6
+; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $2, %k6, %k6
+; AVX512BW-NEXT:    korw %k6, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k2, %k2
+; AVX512BW-NEXT:    kshiftlw $14, %k3, %k3
+; AVX512BW-NEXT:    korw %k3, %k2, %k2
+; AVX512BW-NEXT:    kshiftlw $1, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $1, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    korw %k1, %k2, %k1
+; AVX512BW-NEXT:    vmovdqa32 192(%rsi), %zmm9 {%k1} {z}
+; AVX512BW-NEXT:    kshiftrq $11, %k0, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k2
+; AVX512BW-NEXT:    kshiftrq $10, %k0, %k3
+; AVX512BW-NEXT:    kmovq %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $14, %k2, %k6
+; AVX512BW-NEXT:    korw %k6, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $13, %k2, %k6
+; AVX512BW-NEXT:    korw %k6, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $12, %k2, %k2
+; AVX512BW-NEXT:    korw %k2, %k3, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k2, %k2
+; AVX512BW-NEXT:    kshiftrq $12, %k0, %k3
+; AVX512BW-NEXT:    kshiftlw $15, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $11, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $10, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $9, %k3, %k3
+; AVX512BW-NEXT:    korw %k3, %k2, %k2
+; AVX512BW-NEXT:    kandw %k4, %k2, %k2
+; AVX512BW-NEXT:    kshiftrq $13, %k0, %k3
+; AVX512BW-NEXT:    kshiftlw $15, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $8, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $7, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k2, %k2
+; AVX512BW-NEXT:    kandw %k5, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $6, %k3, %k3
+; AVX512BW-NEXT:    korw %k3, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k2, %k2
+; AVX512BW-NEXT:    kshiftrq $14, %k0, %k3
+; AVX512BW-NEXT:    kshiftlw $15, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $5, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $4, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $3, %k3, %k3
+; AVX512BW-NEXT:    korw %k3, %k2, %k2
+; AVX512BW-NEXT:    kandw %k7, %k2, %k2
+; AVX512BW-NEXT:    kshiftrq $15, %k0, %k3
+; AVX512BW-NEXT:    kshiftlw $15, %k3, %k6
+; AVX512BW-NEXT:    kshiftrw $2, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k2, %k2
+; AVX512BW-NEXT:    kshiftlw $14, %k3, %k3
+; AVX512BW-NEXT:    korw %k3, %k2, %k2
+; AVX512BW-NEXT:    kshiftlw $1, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $1, %k2, %k2
+; AVX512BW-NEXT:    korw %k6, %k2, %k2
+; AVX512BW-NEXT:    vmovdqa32 128(%rsi), %zmm10 {%k2} {z}
+; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 8-byte Reload
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kshiftrw $14, %k3, %k3
+; AVX512BW-NEXT:    korw %k3, %k2, %k2
+; AVX512BW-NEXT:    kandw %k1, %k2, %k2
+; AVX512BW-NEXT:    kshiftrq $6, %k0, %k3
+; AVX512BW-NEXT:    kshiftlw $15, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $13, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $12, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $11, %k3, %k3
+; AVX512BW-NEXT:    korw %k3, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k2, %k2
+; AVX512BW-NEXT:    kshiftrq $7, %k0, %k3
+; AVX512BW-NEXT:    kshiftlw $15, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $10, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $9, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $8, %k3, %k3
+; AVX512BW-NEXT:    korw %k3, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k2, %k2
+; AVX512BW-NEXT:    kshiftrq $8, %k0, %k3
+; AVX512BW-NEXT:    kshiftlw $15, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $7, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $6, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $5, %k3, %k3
+; AVX512BW-NEXT:    korw %k3, %k2, %k2
+; AVX512BW-NEXT:    kshiftrq $9, %k0, %k0
+; AVX512BW-NEXT:    kandw %k4, %k2, %k2
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $4, %k0, %k3
+; AVX512BW-NEXT:    korw %k3, %k2, %k2
+; AVX512BW-NEXT:    kandw %k5, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $3, %k0, %k3
+; AVX512BW-NEXT:    korw %k3, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $2, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k2, %k0
+; AVX512BW-NEXT:    kandw %k7, %k0, %k0
+; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 8-byte Reload
+; AVX512BW-NEXT:    kshiftlw $14, %k1, %k2
+; AVX512BW-NEXT:    korw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k1
+; AVX512BW-NEXT:    vmovdqa32 64(%rsi), %zmm11 {%k1} {z}
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, 64(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm10, 128(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, 192(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, 256(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, 320(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, 384(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, 448(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, 512(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, 576(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, 640(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, 704(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdx)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+  %src.mask = load <64 x i1>, ptr %in.maskvec, align 64
+  %tgt.mask = shufflevector <64 x i1> %src.mask, <64 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
+  %data = call <192 x i32> @llvm.masked.load.v192i32.p0(ptr %in.vec, i32 64, <192 x i1> %tgt.mask, <192 x i32> poison)
+  store <192 x i32> %data, ptr %out.vec, align 64
+  ret void
 }
 
-define void @mask_replication_factor4_vf2(ptr %in.vec, ptr %out.vec) nounwind {
+define void @mask_replication_factor4_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512F-SLOW-LABEL: mask_replication_factor4_vf2:
 ; AVX512F-SLOW:       # %bb.0:
 ; AVX512F-SLOW-NEXT:    kmovw (%rdi), %k1
@@ -4832,9 +2423,9 @@ define void @mask_replication_factor4_vf2(ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512F-SLOW-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; AVX512F-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
 ; AVX512F-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
-; AVX512F-SLOW-NEXT:    vptestmd %ymm0, %ymm0, %k0
-; AVX512F-SLOW-NEXT:    kmovw %k0, %eax
-; AVX512F-SLOW-NEXT:    movb %al, (%rsi)
+; AVX512F-SLOW-NEXT:    vptestmd %ymm0, %ymm0, %k1
+; AVX512F-SLOW-NEXT:    vmovdqa32 (%rsi), %ymm0 {%k1} {z}
+; AVX512F-SLOW-NEXT:    vmovdqa %ymm0, (%rdx)
 ; AVX512F-SLOW-NEXT:    vzeroupper
 ; AVX512F-SLOW-NEXT:    retq
 ;
@@ -4845,143 +2436,155 @@ define void @mask_replication_factor4_vf2(ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512F-FAST-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1]
 ; AVX512F-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
-; AVX512F-FAST-NEXT:    vptestmd %ymm0, %ymm0, %k0
-; AVX512F-FAST-NEXT:    kmovw %k0, %eax
-; AVX512F-FAST-NEXT:    movb %al, (%rsi)
+; AVX512F-FAST-NEXT:    vptestmd %ymm0, %ymm0, %k1
+; AVX512F-FAST-NEXT:    vmovdqa32 (%rsi), %ymm0 {%k1} {z}
+; AVX512F-FAST-NEXT:    vmovdqa %ymm0, (%rdx)
 ; AVX512F-FAST-NEXT:    vzeroupper
 ; AVX512F-FAST-NEXT:    retq
 ;
 ; AVX512DQ-SLOW-LABEL: mask_replication_factor4_vf2:
 ; AVX512DQ-SLOW:       # %bb.0:
-; AVX512DQ-SLOW-NEXT:    kmovb (%rdi), %k0
+; AVX512DQ-SLOW-NEXT:    kmovw (%rdi), %k0
 ; AVX512DQ-SLOW-NEXT:    vpmovm2d %k0, %ymm0
 ; AVX512DQ-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
 ; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
-; AVX512DQ-SLOW-NEXT:    vpmovd2m %ymm0, %k0
-; AVX512DQ-SLOW-NEXT:    kmovb %k0, (%rsi)
+; AVX512DQ-SLOW-NEXT:    vpmovd2m %ymm0, %k1
+; AVX512DQ-SLOW-NEXT:    vmovdqa32 (%rsi), %ymm0 {%k1} {z}
+; AVX512DQ-SLOW-NEXT:    vmovdqa %ymm0, (%rdx)
 ; AVX512DQ-SLOW-NEXT:    vzeroupper
 ; AVX512DQ-SLOW-NEXT:    retq
 ;
 ; AVX512DQ-FAST-LABEL: mask_replication_factor4_vf2:
 ; AVX512DQ-FAST:       # %bb.0:
-; AVX512DQ-FAST-NEXT:    kmovb (%rdi), %k0
+; AVX512DQ-FAST-NEXT:    kmovw (%rdi), %k0
 ; AVX512DQ-FAST-NEXT:    vpmovm2d %k0, %ymm0
 ; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1]
 ; AVX512DQ-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
-; AVX512DQ-FAST-NEXT:    vpmovd2m %ymm0, %k0
-; AVX512DQ-FAST-NEXT:    kmovb %k0, (%rsi)
+; AVX512DQ-FAST-NEXT:    vpmovd2m %ymm0, %k1
+; AVX512DQ-FAST-NEXT:    vmovdqa32 (%rsi), %ymm0 {%k1} {z}
+; AVX512DQ-FAST-NEXT:    vmovdqa %ymm0, (%rdx)
 ; AVX512DQ-FAST-NEXT:    vzeroupper
 ; AVX512DQ-FAST-NEXT:    retq
 ;
 ; AVX512BW-SLOW-LABEL: mask_replication_factor4_vf2:
 ; AVX512BW-SLOW:       # %bb.0:
-; AVX512BW-SLOW-NEXT:    kmovw (%rdi), %k1
+; AVX512BW-SLOW-NEXT:    kmovq (%rdi), %k1
 ; AVX512BW-SLOW-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
 ; AVX512BW-SLOW-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; AVX512BW-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
 ; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
-; AVX512BW-SLOW-NEXT:    vptestmd %ymm0, %ymm0, %k0
-; AVX512BW-SLOW-NEXT:    kmovd %k0, %eax
-; AVX512BW-SLOW-NEXT:    movb %al, (%rsi)
+; AVX512BW-SLOW-NEXT:    vptestmd %ymm0, %ymm0, %k1
+; AVX512BW-SLOW-NEXT:    vmovdqa32 (%rsi), %ymm0 {%k1} {z}
+; AVX512BW-SLOW-NEXT:    vmovdqa %ymm0, (%rdx)
 ; AVX512BW-SLOW-NEXT:    vzeroupper
 ; AVX512BW-SLOW-NEXT:    retq
 ;
 ; AVX512BW-FAST-LABEL: mask_replication_factor4_vf2:
 ; AVX512BW-FAST:       # %bb.0:
-; AVX512BW-FAST-NEXT:    kmovw (%rdi), %k1
+; AVX512BW-FAST-NEXT:    kmovq (%rdi), %k1
 ; AVX512BW-FAST-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
 ; AVX512BW-FAST-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1]
 ; AVX512BW-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
-; AVX512BW-FAST-NEXT:    vptestmd %ymm0, %ymm0, %k0
-; AVX512BW-FAST-NEXT:    kmovd %k0, %eax
-; AVX512BW-FAST-NEXT:    movb %al, (%rsi)
+; AVX512BW-FAST-NEXT:    vptestmd %ymm0, %ymm0, %k1
+; AVX512BW-FAST-NEXT:    vmovdqa32 (%rsi), %ymm0 {%k1} {z}
+; AVX512BW-FAST-NEXT:    vmovdqa %ymm0, (%rdx)
 ; AVX512BW-FAST-NEXT:    vzeroupper
 ; AVX512BW-FAST-NEXT:    retq
 ;
 ; AVX512VBMI-SLOW-LABEL: mask_replication_factor4_vf2:
 ; AVX512VBMI-SLOW:       # %bb.0:
-; AVX512VBMI-SLOW-NEXT:    kmovw (%rdi), %k1
+; AVX512VBMI-SLOW-NEXT:    kmovq (%rdi), %k1
 ; AVX512VBMI-SLOW-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
 ; AVX512VBMI-SLOW-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; AVX512VBMI-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
 ; AVX512VBMI-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
-; AVX512VBMI-SLOW-NEXT:    vptestmd %ymm0, %ymm0, %k0
-; AVX512VBMI-SLOW-NEXT:    kmovd %k0, %eax
-; AVX512VBMI-SLOW-NEXT:    movb %al, (%rsi)
+; AVX512VBMI-SLOW-NEXT:    vptestmd %ymm0, %ymm0, %k1
+; AVX512VBMI-SLOW-NEXT:    vmovdqa32 (%rsi), %ymm0 {%k1} {z}
+; AVX512VBMI-SLOW-NEXT:    vmovdqa %ymm0, (%rdx)
 ; AVX512VBMI-SLOW-NEXT:    vzeroupper
 ; AVX512VBMI-SLOW-NEXT:    retq
 ;
 ; AVX512VBMI-FAST-LABEL: mask_replication_factor4_vf2:
 ; AVX512VBMI-FAST:       # %bb.0:
-; AVX512VBMI-FAST-NEXT:    kmovw (%rdi), %k1
+; AVX512VBMI-FAST-NEXT:    kmovq (%rdi), %k1
 ; AVX512VBMI-FAST-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
 ; AVX512VBMI-FAST-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; AVX512VBMI-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1]
 ; AVX512VBMI-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
-; AVX512VBMI-FAST-NEXT:    vptestmd %ymm0, %ymm0, %k0
-; AVX512VBMI-FAST-NEXT:    kmovd %k0, %eax
-; AVX512VBMI-FAST-NEXT:    movb %al, (%rsi)
+; AVX512VBMI-FAST-NEXT:    vptestmd %ymm0, %ymm0, %k1
+; AVX512VBMI-FAST-NEXT:    vmovdqa32 (%rsi), %ymm0 {%k1} {z}
+; AVX512VBMI-FAST-NEXT:    vmovdqa %ymm0, (%rdx)
 ; AVX512VBMI-FAST-NEXT:    vzeroupper
 ; AVX512VBMI-FAST-NEXT:    retq
-  %src.vec = load <2 x i1>, ptr %in.vec, align 64
-  %tgt.mask = shufflevector <2 x i1> %src.vec, <2 x i1> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
-  store <8 x i1> %tgt.mask, ptr %out.vec, align 64
+  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
+  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <2 x i32> <i32 0, i32 1>
+  %tgt.mask = shufflevector <2 x i1> %src.mask, <2 x i1> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
+  %data = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr %in.vec, i32 64, <8 x i1> %tgt.mask, <8 x i32> poison)
+  %data.padded = shufflevector <8 x i32> %data, <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  store <8 x i32> %data, ptr %out.vec, align 64
   ret void
 }
 
-define void @mask_replication_factor4_vf4(ptr %in.vec, ptr %out.vec) nounwind {
+define void @mask_replication_factor4_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512F-ONLY-LABEL: mask_replication_factor4_vf4:
 ; AVX512F-ONLY:       # %bb.0:
 ; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
 ; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm0
-; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, (%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, (%rdx)
 ; AVX512F-ONLY-NEXT:    vzeroupper
 ; AVX512F-ONLY-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: mask_replication_factor4_vf4:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    kmovb (%rdi), %k0
+; AVX512DQ-NEXT:    kmovw (%rdi), %k0
 ; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
 ; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
-; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k0
-; AVX512DQ-NEXT:    kmovw %k0, (%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k1
+; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
+; AVX512DQ-NEXT:    vmovdqa64 %zmm0, (%rdx)
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512BW-LABEL: mask_replication_factor4_vf4:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    kmovw (%rdi), %k1
+; AVX512BW-NEXT:    kmovq (%rdi), %k1
 ; AVX512BW-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
 ; AVX512BW-NEXT:    vpermd %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; AVX512BW-NEXT:    kmovw %k0, (%rsi)
+; AVX512BW-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdx)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
-  %src.vec = load <4 x i1>, ptr %in.vec, align 64
-  %tgt.mask = shufflevector <4 x i1> %src.vec, <4 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
-  store <16 x i1> %tgt.mask, ptr %out.vec, align 64
+  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
+  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %tgt.mask = shufflevector <4 x i1> %src.mask, <4 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
+  %data = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr %in.vec, i32 64, <16 x i1> %tgt.mask, <16 x i32> poison)
+  store <16 x i32> %data, ptr %out.vec, align 64
   ret void
 }
 
-define void @mask_replication_factor4_vf8(ptr %in.vec, ptr %out.vec) nounwind {
+define void @mask_replication_factor4_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512F-ONLY-LABEL: mask_replication_factor4_vf8:
 ; AVX512F-ONLY:       # %bb.0:
 ; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
 ; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
-; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
-; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k0
 ; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm0
-; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; AVX512F-ONLY-NEXT:    kmovw %k1, 2(%rsi)
-; AVX512F-ONLY-NEXT:    kmovw %k0, (%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k2
+; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k2} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm1, 64(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, (%rdx)
 ; AVX512F-ONLY-NEXT:    vzeroupper
 ; AVX512F-ONLY-NEXT:    retq
 ;
@@ -4989,14 +2592,16 @@ define void @mask_replication_factor4_vf8(ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    kmovb (%rdi), %k0
 ; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
-; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
-; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k0
 ; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k1
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
-; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k1
-; AVX512DQ-NEXT:    kmovw %k1, 2(%rsi)
-; AVX512DQ-NEXT:    kmovw %k0, (%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k2
+; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k2} {z}
+; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
+; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 64(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm0, (%rdx)
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
@@ -5006,37 +2611,47 @@ define void @mask_replication_factor4_vf8(ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512BW-NEXT:    vpmovm2w %k0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
 ; AVX512BW-NEXT:    vpermw %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT:    vpmovw2m %zmm0, %k0
-; AVX512BW-NEXT:    kmovd %k0, (%rsi)
+; AVX512BW-NEXT:    vpmovw2m %zmm0, %k1
+; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    kshiftrd $16, %k1, %k1
+; AVX512BW-NEXT:    vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, 64(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdx)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
-  %src.vec = load <8 x i1>, ptr %in.vec, align 64
-  %tgt.mask = shufflevector <8 x i1> %src.vec, <8 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
-  store <32 x i1> %tgt.mask, ptr %out.vec, align 64
+  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
+  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %tgt.mask = shufflevector <8 x i1> %src.mask, <8 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
+  %data = call <32 x i32> @llvm.masked.load.v32i32.p0(ptr %in.vec, i32 64, <32 x i1> %tgt.mask, <32 x i32> poison)
+  store <32 x i32> %data, ptr %out.vec, align 64
   ret void
 }
 
-define void @mask_replication_factor4_vf16(ptr %in.vec, ptr %out.vec) nounwind {
+define void @mask_replication_factor4_vf16(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512F-ONLY-LABEL: mask_replication_factor4_vf16:
 ; AVX512F-ONLY:       # %bb.0:
 ; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
 ; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
-; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
-; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11]
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
 ; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
 ; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
 ; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k2
-; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11]
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k3
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm0
-; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k3
-; AVX512F-ONLY-NEXT:    kmovw %k3, 4(%rsi)
-; AVX512F-ONLY-NEXT:    kmovw %k2, 6(%rsi)
-; AVX512F-ONLY-NEXT:    kmovw %k1, 2(%rsi)
-; AVX512F-ONLY-NEXT:    kmovw %k0, (%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k4
+; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k4} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k3} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm1, (%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
 ; AVX512F-ONLY-NEXT:    vzeroupper
 ; AVX512F-ONLY-NEXT:    retq
 ;
@@ -5044,124 +2659,170 @@ define void @mask_replication_factor4_vf16(ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    kmovw (%rdi), %k0
 ; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
-; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
-; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k0
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11]
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
 ; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k1
 ; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
 ; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k2
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11]
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k3
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
-; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k3
-; AVX512DQ-NEXT:    kmovw %k3, 4(%rsi)
-; AVX512DQ-NEXT:    kmovw %k2, 6(%rsi)
-; AVX512DQ-NEXT:    kmovw %k1, 2(%rsi)
-; AVX512DQ-NEXT:    kmovw %k0, (%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k4
+; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k4} {z}
+; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k3} {z}
+; AVX512DQ-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
+; AVX512DQ-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
+; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm1, (%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512BW-ONLY-LABEL: mask_replication_factor4_vf16:
 ; AVX512BW-ONLY:       # %bb.0:
-; AVX512BW-ONLY-NEXT:    kmovw (%rdi), %k0
+; AVX512BW-ONLY-NEXT:    kmovq (%rdi), %k0
 ; AVX512BW-ONLY-NEXT:    vpmovm2b %k0, %zmm0
 ; AVX512BW-ONLY-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
 ; AVX512BW-ONLY-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,20,20,20,20,21,21,21,21,22,22,22,22,23,23,23,23,40,40,40,40,41,41,41,41,42,42,42,42,43,43,43,43,60,60,60,60,61,61,61,61,62,62,62,62,63,63,63,63]
-; AVX512BW-ONLY-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512BW-ONLY-NEXT:    kmovq %k0, (%rsi)
+; AVX512BW-ONLY-NEXT:    vpmovb2m %zmm0, %k1
+; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
+; AVX512BW-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k1} {z}
+; AVX512BW-ONLY-NEXT:    kshiftrq $32, %k1, %k1
+; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
+; AVX512BW-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm1, (%rdx)
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
 ; AVX512BW-ONLY-NEXT:    vzeroupper
 ; AVX512BW-ONLY-NEXT:    retq
 ;
 ; AVX512VBMI-ONLY-LABEL: mask_replication_factor4_vf16:
 ; AVX512VBMI-ONLY:       # %bb.0:
-; AVX512VBMI-ONLY-NEXT:    kmovw (%rdi), %k0
+; AVX512VBMI-ONLY-NEXT:    kmovq (%rdi), %k0
 ; AVX512VBMI-ONLY-NEXT:    vpmovm2b %k0, %zmm0
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
 ; AVX512VBMI-ONLY-NEXT:    vpermb %zmm0, %zmm1, %zmm0
-; AVX512VBMI-ONLY-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512VBMI-ONLY-NEXT:    kmovq %k0, (%rsi)
+; AVX512VBMI-ONLY-NEXT:    vpmovb2m %zmm0, %k1
+; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k1} {z}
+; AVX512VBMI-ONLY-NEXT:    kshiftrq $32, %k1, %k1
+; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm1, (%rdx)
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
 ; AVX512VBMI-ONLY-NEXT:    vzeroupper
 ; AVX512VBMI-ONLY-NEXT:    retq
-  %src.vec = load <16 x i1>, ptr %in.vec, align 64
-  %tgt.mask = shufflevector <16 x i1> %src.vec, <16 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
-  store <64 x i1> %tgt.mask, ptr %out.vec, align 64
+  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
+  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %tgt.mask = shufflevector <16 x i1> %src.mask, <16 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
+  %data = call <64 x i32> @llvm.masked.load.v64i32.p0(ptr %in.vec, i32 64, <64 x i1> %tgt.mask, <64 x i32> poison)
+  store <64 x i32> %data, ptr %out.vec, align 64
   ret void
 }
 
-define void @mask_replication_factor4_vf32(ptr %in.vec, ptr %out.vec) nounwind {
+define void @mask_replication_factor4_vf32(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512F-ONLY-LABEL: mask_replication_factor4_vf32:
 ; AVX512F-ONLY:       # %bb.0:
-; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
-; AVX512F-ONLY-NEXT:    kmovw 2(%rdi), %k3
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k4
+; AVX512F-ONLY-NEXT:    kmovw 2(%rdi), %k1
 ; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11]
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm2
-; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k0
-; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
+; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k1
+; AVX512F-ONLY-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm2, %zmm3
-; AVX512F-ONLY-NEXT:    vptestmd %zmm3, %zmm3, %k1
-; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
+; AVX512F-ONLY-NEXT:    vptestmd %zmm3, %zmm3, %k2
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm3, %zmm4
-; AVX512F-ONLY-NEXT:    vptestmd %zmm4, %zmm4, %k2
-; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11]
+; AVX512F-ONLY-NEXT:    vptestmd %zmm4, %zmm4, %k3
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm4, %zmm0
-; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k4
-; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k3} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k5
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z}
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
-; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k3
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k4
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm2, %zmm1
-; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k5
-; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm3, %zmm1
 ; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k6
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm3, %zmm1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k7
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm4, %zmm0
-; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k7
-; AVX512F-ONLY-NEXT:    kmovw %k7, 12(%rsi)
-; AVX512F-ONLY-NEXT:    kmovw %k6, 14(%rsi)
-; AVX512F-ONLY-NEXT:    kmovw %k5, 8(%rsi)
-; AVX512F-ONLY-NEXT:    kmovw %k3, 10(%rsi)
-; AVX512F-ONLY-NEXT:    kmovw %k4, 4(%rsi)
-; AVX512F-ONLY-NEXT:    kmovw %k2, 6(%rsi)
-; AVX512F-ONLY-NEXT:    kmovw %k1, (%rsi)
-; AVX512F-ONLY-NEXT:    kmovw %k0, 2(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k7} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k6} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k4} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 320(%rsi), %zmm4 {%k5} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 256(%rsi), %zmm5 {%k3} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
+; AVX512F-ONLY-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512F-ONLY-NEXT:    vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm7, 384(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm6, 448(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm5, 256(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm4, 320(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm1, (%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
 ; AVX512F-ONLY-NEXT:    vzeroupper
 ; AVX512F-ONLY-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: mask_replication_factor4_vf32:
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    kmovw (%rdi), %k0
-; AVX512DQ-NEXT:    kmovw 2(%rdi), %k3
-; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
+; AVX512DQ-NEXT:    kmovw 2(%rdi), %k1
+; AVX512DQ-NEXT:    vpmovm2d %k1, %zmm0
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11]
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm2
-; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k0
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
+; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k1
+; AVX512DQ-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm2, %zmm3
-; AVX512DQ-NEXT:    vpmovd2m %zmm3, %k1
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
+; AVX512DQ-NEXT:    vpmovd2m %zmm3, %k2
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm3, %zmm4
-; AVX512DQ-NEXT:    vpmovd2m %zmm4, %k2
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11]
+; AVX512DQ-NEXT:    vpmovd2m %zmm4, %k3
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm4, %zmm0
 ; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k4
-; AVX512DQ-NEXT:    vpmovm2d %k3, %zmm0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
-; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k3
-; AVX512DQ-NEXT:    vpermd %zmm0, %zmm2, %zmm1
 ; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k5
-; AVX512DQ-NEXT:    vpermd %zmm0, %zmm3, %zmm1
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm2, %zmm1
 ; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k6
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm3, %zmm1
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k7
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm4, %zmm0
-; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k7
-; AVX512DQ-NEXT:    kmovw %k7, 12(%rsi)
-; AVX512DQ-NEXT:    kmovw %k6, 14(%rsi)
-; AVX512DQ-NEXT:    kmovw %k5, 8(%rsi)
-; AVX512DQ-NEXT:    kmovw %k3, 10(%rsi)
-; AVX512DQ-NEXT:    kmovw %k4, 4(%rsi)
-; AVX512DQ-NEXT:    kmovw %k2, 6(%rsi)
-; AVX512DQ-NEXT:    kmovw %k1, (%rsi)
-; AVX512DQ-NEXT:    kmovw %k0, 2(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k1
+; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
+; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k7} {z}
+; AVX512DQ-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k6} {z}
+; AVX512DQ-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k5} {z}
+; AVX512DQ-NEXT:    vmovdqa32 320(%rsi), %zmm4 {%k4} {z}
+; AVX512DQ-NEXT:    vmovdqa32 256(%rsi), %zmm5 {%k3} {z}
+; AVX512DQ-NEXT:    vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512DQ-NEXT:    vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
+; AVX512DQ-NEXT:    vmovdqa64 %zmm7, 384(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm6, 448(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm5, 256(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 320(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm1, (%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
@@ -5169,15 +2830,35 @@ define void @mask_replication_factor4_vf32(ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512BW-ONLY:       # %bb.0:
 ; AVX512BW-ONLY-NEXT:    kmovd (%rdi), %k0
 ; AVX512BW-ONLY-NEXT:    vpmovm2b %k0, %zmm0
-; AVX512BW-ONLY-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,0,1,0,1,0,1]
+; AVX512BW-ONLY-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,2,3,2,3]
 ; AVX512BW-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
 ; AVX512BW-ONLY-NEXT:    vpshufb %zmm2, %zmm1, %zmm1
-; AVX512BW-ONLY-NEXT:    vpmovb2m %zmm1, %k0
-; AVX512BW-ONLY-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3,2,3,2,3]
+; AVX512BW-ONLY-NEXT:    vpmovb2m %zmm1, %k1
+; AVX512BW-ONLY-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
 ; AVX512BW-ONLY-NEXT:    vpshufb %zmm2, %zmm0, %zmm0
-; AVX512BW-ONLY-NEXT:    vpmovb2m %zmm0, %k1
-; AVX512BW-ONLY-NEXT:    kmovq %k1, 8(%rsi)
-; AVX512BW-ONLY-NEXT:    kmovq %k0, (%rsi)
+; AVX512BW-ONLY-NEXT:    vpmovb2m %zmm0, %k2
+; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k2, %k3
+; AVX512BW-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k3} {z}
+; AVX512BW-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k2} {z}
+; AVX512BW-ONLY-NEXT:    kshiftrq $32, %k2, %k2
+; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k2, %k3
+; AVX512BW-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k3} {z}
+; AVX512BW-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k2} {z}
+; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT:    vmovdqa32 320(%rsi), %zmm4 {%k2} {z}
+; AVX512BW-ONLY-NEXT:    vmovdqa32 256(%rsi), %zmm5 {%k1} {z}
+; AVX512BW-ONLY-NEXT:    kshiftrq $32, %k1, %k1
+; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT:    vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
+; AVX512BW-ONLY-NEXT:    vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm7, 384(%rdx)
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm6, 448(%rdx)
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm5, 256(%rdx)
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm4, 320(%rdx)
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm1, (%rdx)
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
 ; AVX512BW-ONLY-NEXT:    vzeroupper
 ; AVX512BW-ONLY-NEXT:    retq
 ;
@@ -5185,39 +2866,61 @@ define void @mask_replication_factor4_vf32(ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512VBMI-ONLY:       # %bb.0:
 ; AVX512VBMI-ONLY-NEXT:    kmovd (%rdi), %k0
 ; AVX512VBMI-ONLY-NEXT:    vpmovm2b %k0, %zmm0
-; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
-; AVX512VBMI-ONLY-NEXT:    vpermb %zmm0, %zmm1, %zmm1
-; AVX512VBMI-ONLY-NEXT:    vpmovb2m %zmm1, %k0
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [16,16,16,16,17,17,17,17,18,18,18,18,19,19,19,19,20,20,20,20,21,21,21,21,22,22,22,22,23,23,23,23,24,24,24,24,25,25,25,25,26,26,26,26,27,27,27,27,28,28,28,28,29,29,29,29,30,30,30,30,31,31,31,31]
+; AVX512VBMI-ONLY-NEXT:    vpermb %zmm0, %zmm1, %zmm1
+; AVX512VBMI-ONLY-NEXT:    vpmovb2m %zmm1, %k1
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
 ; AVX512VBMI-ONLY-NEXT:    vpermb %zmm0, %zmm1, %zmm0
-; AVX512VBMI-ONLY-NEXT:    vpmovb2m %zmm0, %k1
-; AVX512VBMI-ONLY-NEXT:    kmovq %k1, 8(%rsi)
-; AVX512VBMI-ONLY-NEXT:    kmovq %k0, (%rsi)
+; AVX512VBMI-ONLY-NEXT:    vpmovb2m %zmm0, %k2
+; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k2, %k3
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k3} {z}
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k2} {z}
+; AVX512VBMI-ONLY-NEXT:    kshiftrq $32, %k2, %k2
+; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k2, %k3
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k3} {z}
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k2} {z}
+; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 320(%rsi), %zmm4 {%k2} {z}
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 256(%rsi), %zmm5 {%k1} {z}
+; AVX512VBMI-ONLY-NEXT:    kshiftrq $32, %k1, %k1
+; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm7, 384(%rdx)
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm6, 448(%rdx)
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm5, 256(%rdx)
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm4, 320(%rdx)
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm1, (%rdx)
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
 ; AVX512VBMI-ONLY-NEXT:    vzeroupper
 ; AVX512VBMI-ONLY-NEXT:    retq
-  %src.vec = load <32 x i1>, ptr %in.vec, align 64
-  %tgt.mask = shufflevector <32 x i1> %src.vec, <32 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
-  store <128 x i1> %tgt.mask, ptr %out.vec, align 64
+  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
+  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %tgt.mask = shufflevector <32 x i1> %src.mask, <32 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
+  %data = call <128 x i32> @llvm.masked.load.v128i32.p0(ptr %in.vec, i32 64, <128 x i1> %tgt.mask, <128 x i32> poison)
+  store <128 x i32> %data, ptr %out.vec, align 64
   ret void
 }
 
-define void @mask_replication_factor4_vf64(ptr %in.vec, ptr %out.vec) nounwind {
+define void @mask_replication_factor4_vf64(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512F-ONLY-LABEL: mask_replication_factor4_vf64:
 ; AVX512F-ONLY:       # %bb.0:
-; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
+; AVX512F-ONLY-NEXT:    kmovw 6(%rdi), %k1
 ; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-ONLY-NEXT:    kmovw 2(%rdi), %k1
-; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; AVX512F-ONLY-NEXT:    kmovw 4(%rdi), %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; AVX512F-ONLY-NEXT:    kmovw 2(%rdi), %k1
 ; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; AVX512F-ONLY-NEXT:    kmovw 6(%rdi), %k1
-; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm3, %zmm4
-; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11]
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm5, %zmm6
-; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm7, %zmm8
-; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11]
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm9, %zmm0
 ; AVX512F-ONLY-NEXT:    vpermd %zmm1, %zmm3, %zmm10
 ; AVX512F-ONLY-NEXT:    vpermd %zmm1, %zmm5, %zmm11
@@ -5232,57 +2935,73 @@ define void @mask_replication_factor4_vf64(ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512F-ONLY-NEXT:    vpermd %zmm16, %zmm5, %zmm5
 ; AVX512F-ONLY-NEXT:    vpermd %zmm16, %zmm7, %zmm7
 ; AVX512F-ONLY-NEXT:    vpermd %zmm16, %zmm9, %zmm9
-; AVX512F-ONLY-NEXT:    vptestmd %zmm9, %zmm9, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 28(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm7, %zmm7, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 30(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm5, %zmm5, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 24(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm3, %zmm3, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 26(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 20(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm15, %zmm15, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 22(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm14, %zmm14, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 16(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm13, %zmm13, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 18(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 12(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm12, %zmm12, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 14(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm11, %zmm11, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 8(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm10, %zmm10, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 10(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 4(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm8, %zmm8, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 6(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm6, %zmm6, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, (%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm4, %zmm4, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 2(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm9, %zmm9, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm9 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm7, %zmm7, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm7 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm5, %zmm5, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm5 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm3, %zmm3, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm3 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 256(%rsi), %zmm2 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm15, %zmm15, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 320(%rsi), %zmm15 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm14, %zmm14, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 384(%rsi), %zmm14 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm13, %zmm13, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 448(%rsi), %zmm13 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 512(%rsi), %zmm1 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm12, %zmm12, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 576(%rsi), %zmm12 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm11, %zmm11, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 640(%rsi), %zmm11 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm10, %zmm10, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 704(%rsi), %zmm10 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 768(%rsi), %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm8, %zmm8, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 832(%rsi), %zmm8 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm6, %zmm6, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 896(%rsi), %zmm6 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm4, %zmm4, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 960(%rsi), %zmm4 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm4, 960(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm6, 896(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm8, 832(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, 768(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm10, 704(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm11, 640(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm12, 576(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm1, 512(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm13, 448(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm14, 384(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm15, 320(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm2, 256(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm3, 192(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm5, 128(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm7, 64(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm9, (%rdx)
 ; AVX512F-ONLY-NEXT:    vzeroupper
 ; AVX512F-ONLY-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: mask_replication_factor4_vf64:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    kmovw (%rdi), %k0
+; AVX512DQ-NEXT:    kmovw 6(%rdi), %k0
 ; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
-; AVX512DQ-NEXT:    kmovw 2(%rdi), %k0
-; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm1
 ; AVX512DQ-NEXT:    kmovw 4(%rdi), %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm1
+; AVX512DQ-NEXT:    kmovw 2(%rdi), %k0
 ; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm2
-; AVX512DQ-NEXT:    kmovw 6(%rdi), %k0
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
+; AVX512DQ-NEXT:    kmovw (%rdi), %k0
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm3, %zmm4
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11]
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm5, %zmm6
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm7, %zmm8
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11]
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm9, %zmm0
 ; AVX512DQ-NEXT:    vpermd %zmm1, %zmm3, %zmm10
 ; AVX512DQ-NEXT:    vpermd %zmm1, %zmm5, %zmm11
@@ -5297,38 +3016,54 @@ define void @mask_replication_factor4_vf64(ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512DQ-NEXT:    vpermd %zmm16, %zmm5, %zmm5
 ; AVX512DQ-NEXT:    vpermd %zmm16, %zmm7, %zmm7
 ; AVX512DQ-NEXT:    vpermd %zmm16, %zmm9, %zmm9
-; AVX512DQ-NEXT:    vpmovd2m %zmm9, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 28(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm7, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 30(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm5, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 24(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm3, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 26(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 20(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm15, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 22(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm14, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 16(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm13, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 18(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 12(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm12, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 14(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm11, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 8(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm10, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 10(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 4(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm8, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 6(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm6, %k0
-; AVX512DQ-NEXT:    kmovw %k0, (%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm4, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 2(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm9, %k1
+; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm9 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm7, %k1
+; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm7 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm5, %k1
+; AVX512DQ-NEXT:    vmovdqa32 128(%rsi), %zmm5 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm3, %k1
+; AVX512DQ-NEXT:    vmovdqa32 192(%rsi), %zmm3 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k1
+; AVX512DQ-NEXT:    vmovdqa32 256(%rsi), %zmm2 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm15, %k1
+; AVX512DQ-NEXT:    vmovdqa32 320(%rsi), %zmm15 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm14, %k1
+; AVX512DQ-NEXT:    vmovdqa32 384(%rsi), %zmm14 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm13, %k1
+; AVX512DQ-NEXT:    vmovdqa32 448(%rsi), %zmm13 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k1
+; AVX512DQ-NEXT:    vmovdqa32 512(%rsi), %zmm1 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm12, %k1
+; AVX512DQ-NEXT:    vmovdqa32 576(%rsi), %zmm12 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm11, %k1
+; AVX512DQ-NEXT:    vmovdqa32 640(%rsi), %zmm11 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm10, %k1
+; AVX512DQ-NEXT:    vmovdqa32 704(%rsi), %zmm10 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k1
+; AVX512DQ-NEXT:    vmovdqa32 768(%rsi), %zmm0 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm8, %k1
+; AVX512DQ-NEXT:    vmovdqa32 832(%rsi), %zmm8 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm6, %k1
+; AVX512DQ-NEXT:    vmovdqa32 896(%rsi), %zmm6 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm4, %k1
+; AVX512DQ-NEXT:    vmovdqa32 960(%rsi), %zmm4 {%k1} {z}
+; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 960(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm6, 896(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm8, 832(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 768(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm10, 704(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm11, 640(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm12, 576(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 512(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm13, 448(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm14, 384(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm15, 320(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 256(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 192(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm5, 128(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm7, 64(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm9, (%rdx)
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
@@ -5336,23 +3071,63 @@ define void @mask_replication_factor4_vf64(ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512BW-ONLY:       # %bb.0:
 ; AVX512BW-ONLY-NEXT:    kmovq (%rdi), %k0
 ; AVX512BW-ONLY-NEXT:    vpmovm2b %k0, %zmm0
-; AVX512BW-ONLY-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,0,1,0,1,0,1]
+; AVX512BW-ONLY-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[6,7,6,7,6,7,6,7]
 ; AVX512BW-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
 ; AVX512BW-ONLY-NEXT:    vpshufb %zmm2, %zmm1, %zmm1
-; AVX512BW-ONLY-NEXT:    vpmovb2m %zmm1, %k0
-; AVX512BW-ONLY-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,2,3,2,3]
-; AVX512BW-ONLY-NEXT:    vpshufb %zmm2, %zmm1, %zmm1
 ; AVX512BW-ONLY-NEXT:    vpmovb2m %zmm1, %k1
-; AVX512BW-ONLY-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[6,7,6,7,6,7,6,7]
+; AVX512BW-ONLY-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,4,5,4,5,4,5]
 ; AVX512BW-ONLY-NEXT:    vpshufb %zmm2, %zmm1, %zmm1
 ; AVX512BW-ONLY-NEXT:    vpmovb2m %zmm1, %k2
-; AVX512BW-ONLY-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,4,5,4,5,4,5]
+; AVX512BW-ONLY-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,2,3,2,3]
+; AVX512BW-ONLY-NEXT:    vpshufb %zmm2, %zmm1, %zmm1
+; AVX512BW-ONLY-NEXT:    vpmovb2m %zmm1, %k3
+; AVX512BW-ONLY-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
 ; AVX512BW-ONLY-NEXT:    vpshufb %zmm2, %zmm0, %zmm0
-; AVX512BW-ONLY-NEXT:    vpmovb2m %zmm0, %k3
-; AVX512BW-ONLY-NEXT:    kmovq %k3, 16(%rsi)
-; AVX512BW-ONLY-NEXT:    kmovq %k2, 24(%rsi)
-; AVX512BW-ONLY-NEXT:    kmovq %k1, 8(%rsi)
-; AVX512BW-ONLY-NEXT:    kmovq %k0, (%rsi)
+; AVX512BW-ONLY-NEXT:    vpmovb2m %zmm0, %k4
+; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k4, %k5
+; AVX512BW-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k5} {z}
+; AVX512BW-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k4} {z}
+; AVX512BW-ONLY-NEXT:    kshiftrq $32, %k4, %k4
+; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k4, %k5
+; AVX512BW-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k5} {z}
+; AVX512BW-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k4} {z}
+; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k3, %k4
+; AVX512BW-ONLY-NEXT:    vmovdqa32 320(%rsi), %zmm4 {%k4} {z}
+; AVX512BW-ONLY-NEXT:    vmovdqa32 256(%rsi), %zmm5 {%k3} {z}
+; AVX512BW-ONLY-NEXT:    kshiftrq $32, %k3, %k3
+; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k3, %k4
+; AVX512BW-ONLY-NEXT:    vmovdqa32 448(%rsi), %zmm6 {%k4} {z}
+; AVX512BW-ONLY-NEXT:    vmovdqa32 384(%rsi), %zmm7 {%k3} {z}
+; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k2, %k3
+; AVX512BW-ONLY-NEXT:    vmovdqa32 576(%rsi), %zmm8 {%k3} {z}
+; AVX512BW-ONLY-NEXT:    vmovdqa32 512(%rsi), %zmm9 {%k2} {z}
+; AVX512BW-ONLY-NEXT:    kshiftrq $32, %k2, %k2
+; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k2, %k3
+; AVX512BW-ONLY-NEXT:    vmovdqa32 704(%rsi), %zmm10 {%k3} {z}
+; AVX512BW-ONLY-NEXT:    vmovdqa32 640(%rsi), %zmm11 {%k2} {z}
+; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT:    vmovdqa32 832(%rsi), %zmm12 {%k2} {z}
+; AVX512BW-ONLY-NEXT:    vmovdqa32 768(%rsi), %zmm13 {%k1} {z}
+; AVX512BW-ONLY-NEXT:    kshiftrq $32, %k1, %k1
+; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT:    vmovdqa32 960(%rsi), %zmm14 {%k2} {z}
+; AVX512BW-ONLY-NEXT:    vmovdqa32 896(%rsi), %zmm15 {%k1} {z}
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm15, 896(%rdx)
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm14, 960(%rdx)
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm13, 768(%rdx)
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm12, 832(%rdx)
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm11, 640(%rdx)
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm10, 704(%rdx)
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm9, 512(%rdx)
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm8, 576(%rdx)
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm7, 384(%rdx)
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm6, 448(%rdx)
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm5, 256(%rdx)
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm4, 320(%rdx)
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm1, (%rdx)
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
 ; AVX512BW-ONLY-NEXT:    vzeroupper
 ; AVX512BW-ONLY-NEXT:    retq
 ;
@@ -5360,23354 +3135,10042 @@ define void @mask_replication_factor4_vf64(ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512VBMI-ONLY:       # %bb.0:
 ; AVX512VBMI-ONLY-NEXT:    kmovq (%rdi), %k0
 ; AVX512VBMI-ONLY-NEXT:    vpmovm2b %k0, %zmm0
-; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
-; AVX512VBMI-ONLY-NEXT:    vpermb %zmm0, %zmm1, %zmm1
-; AVX512VBMI-ONLY-NEXT:    vpmovb2m %zmm1, %k0
-; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [16,16,16,16,17,17,17,17,18,18,18,18,19,19,19,19,20,20,20,20,21,21,21,21,22,22,22,22,23,23,23,23,24,24,24,24,25,25,25,25,26,26,26,26,27,27,27,27,28,28,28,28,29,29,29,29,30,30,30,30,31,31,31,31]
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [48,48,48,48,49,49,49,49,50,50,50,50,51,51,51,51,52,52,52,52,53,53,53,53,54,54,54,54,55,55,55,55,56,56,56,56,57,57,57,57,58,58,58,58,59,59,59,59,60,60,60,60,61,61,61,61,62,62,62,62,63,63,63,63]
 ; AVX512VBMI-ONLY-NEXT:    vpermb %zmm0, %zmm1, %zmm1
 ; AVX512VBMI-ONLY-NEXT:    vpmovb2m %zmm1, %k1
-; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [48,48,48,48,49,49,49,49,50,50,50,50,51,51,51,51,52,52,52,52,53,53,53,53,54,54,54,54,55,55,55,55,56,56,56,56,57,57,57,57,58,58,58,58,59,59,59,59,60,60,60,60,61,61,61,61,62,62,62,62,63,63,63,63]
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [32,32,32,32,33,33,33,33,34,34,34,34,35,35,35,35,36,36,36,36,37,37,37,37,38,38,38,38,39,39,39,39,40,40,40,40,41,41,41,41,42,42,42,42,43,43,43,43,44,44,44,44,45,45,45,45,46,46,46,46,47,47,47,47]
 ; AVX512VBMI-ONLY-NEXT:    vpermb %zmm0, %zmm1, %zmm1
 ; AVX512VBMI-ONLY-NEXT:    vpmovb2m %zmm1, %k2
-; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [32,32,32,32,33,33,33,33,34,34,34,34,35,35,35,35,36,36,36,36,37,37,37,37,38,38,38,38,39,39,39,39,40,40,40,40,41,41,41,41,42,42,42,42,43,43,43,43,44,44,44,44,45,45,45,45,46,46,46,46,47,47,47,47]
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [16,16,16,16,17,17,17,17,18,18,18,18,19,19,19,19,20,20,20,20,21,21,21,21,22,22,22,22,23,23,23,23,24,24,24,24,25,25,25,25,26,26,26,26,27,27,27,27,28,28,28,28,29,29,29,29,30,30,30,30,31,31,31,31]
+; AVX512VBMI-ONLY-NEXT:    vpermb %zmm0, %zmm1, %zmm1
+; AVX512VBMI-ONLY-NEXT:    vpmovb2m %zmm1, %k3
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
 ; AVX512VBMI-ONLY-NEXT:    vpermb %zmm0, %zmm1, %zmm0
-; AVX512VBMI-ONLY-NEXT:    vpmovb2m %zmm0, %k3
-; AVX512VBMI-ONLY-NEXT:    kmovq %k3, 16(%rsi)
-; AVX512VBMI-ONLY-NEXT:    kmovq %k2, 24(%rsi)
-; AVX512VBMI-ONLY-NEXT:    kmovq %k1, 8(%rsi)
-; AVX512VBMI-ONLY-NEXT:    kmovq %k0, (%rsi)
+; AVX512VBMI-ONLY-NEXT:    vpmovb2m %zmm0, %k4
+; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k4, %k5
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k5} {z}
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k4} {z}
+; AVX512VBMI-ONLY-NEXT:    kshiftrq $32, %k4, %k4
+; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k4, %k5
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k5} {z}
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k4} {z}
+; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k3, %k4
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 320(%rsi), %zmm4 {%k4} {z}
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 256(%rsi), %zmm5 {%k3} {z}
+; AVX512VBMI-ONLY-NEXT:    kshiftrq $32, %k3, %k3
+; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k3, %k4
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 448(%rsi), %zmm6 {%k4} {z}
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 384(%rsi), %zmm7 {%k3} {z}
+; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k2, %k3
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 576(%rsi), %zmm8 {%k3} {z}
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 512(%rsi), %zmm9 {%k2} {z}
+; AVX512VBMI-ONLY-NEXT:    kshiftrq $32, %k2, %k2
+; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k2, %k3
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 704(%rsi), %zmm10 {%k3} {z}
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 640(%rsi), %zmm11 {%k2} {z}
+; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 832(%rsi), %zmm12 {%k2} {z}
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 768(%rsi), %zmm13 {%k1} {z}
+; AVX512VBMI-ONLY-NEXT:    kshiftrq $32, %k1, %k1
+; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 960(%rsi), %zmm14 {%k2} {z}
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 896(%rsi), %zmm15 {%k1} {z}
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm15, 896(%rdx)
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm14, 960(%rdx)
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm13, 768(%rdx)
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm12, 832(%rdx)
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm11, 640(%rdx)
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm10, 704(%rdx)
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm9, 512(%rdx)
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm8, 576(%rdx)
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm7, 384(%rdx)
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm6, 448(%rdx)
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm5, 256(%rdx)
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm4, 320(%rdx)
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm1, (%rdx)
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
 ; AVX512VBMI-ONLY-NEXT:    vzeroupper
 ; AVX512VBMI-ONLY-NEXT:    retq
-  %src.vec = load <64 x i1>, ptr %in.vec, align 64
-  %tgt.mask = shufflevector <64 x i1> %src.vec, <64 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
-  store <256 x i1> %tgt.mask, ptr %out.vec, align 64
+  %src.mask = load <64 x i1>, ptr %in.maskvec, align 64
+  %tgt.mask = shufflevector <64 x i1> %src.mask, <64 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
+  %data = call <256 x i32> @llvm.masked.load.v256i32.p0(ptr %in.vec, i32 64, <256 x i1> %tgt.mask, <256 x i32> poison)
+  store <256 x i32> %data, ptr %out.vec, align 64
   ret void
 }
 
-define void @mask_replication_factor5_vf2(ptr %in.vec, ptr %out.vec) nounwind {
+define void @mask_replication_factor5_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512F-ONLY-LABEL: mask_replication_factor5_vf2:
 ; AVX512F-ONLY:       # %bb.0:
-; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k0
-; AVX512F-ONLY-NEXT:    kshiftrw $1, %k0, %k1
-; AVX512F-ONLY-NEXT:    kmovw %k0, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    leal (%rax,%rax,2), %ecx
-; AVX512F-ONLY-NEXT:    leal (%rcx,%rax,4), %ecx
-; AVX512F-ONLY-NEXT:    leal (%rcx,%rax,8), %ecx
-; AVX512F-ONLY-NEXT:    shll $4, %eax
-; AVX512F-ONLY-NEXT:    orl %ecx, %eax
-; AVX512F-ONLY-NEXT:    kmovw %k1, %ecx
-; AVX512F-ONLY-NEXT:    movl %ecx, %edx
-; AVX512F-ONLY-NEXT:    andl $1, %edx
-; AVX512F-ONLY-NEXT:    movl %edx, %edi
-; AVX512F-ONLY-NEXT:    shll $5, %edi
-; AVX512F-ONLY-NEXT:    orl %eax, %edi
-; AVX512F-ONLY-NEXT:    movl %edx, %eax
-; AVX512F-ONLY-NEXT:    shll $6, %eax
-; AVX512F-ONLY-NEXT:    movl %edx, %r8d
-; AVX512F-ONLY-NEXT:    shll $7, %r8d
-; AVX512F-ONLY-NEXT:    orl %eax, %r8d
-; AVX512F-ONLY-NEXT:    shll $8, %edx
-; AVX512F-ONLY-NEXT:    orl %r8d, %edx
-; AVX512F-ONLY-NEXT:    shll $9, %ecx
-; AVX512F-ONLY-NEXT:    orl %edx, %ecx
-; AVX512F-ONLY-NEXT:    orl %edi, %ecx
-; AVX512F-ONLY-NEXT:    andl $1023, %ecx # imm = 0x3FF
-; AVX512F-ONLY-NEXT:    movw %cx, (%rsi)
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,0,1,1,1,1,1,u,u,u,u,u,u>
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512F-ONLY-NEXT:    vpslld $31, %zmm0, %zmm0
+; AVX512F-ONLY-NEXT:    movw $1023, %ax # imm = 0x3FF
+; AVX512F-ONLY-NEXT:    kmovw %eax, %k1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
+; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
+; AVX512F-ONLY-NEXT:    vmovq %xmm1, 32(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa %ymm0, (%rdx)
+; AVX512F-ONLY-NEXT:    vzeroupper
 ; AVX512F-ONLY-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: mask_replication_factor5_vf2:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    kmovb (%rdi), %k0
-; AVX512DQ-NEXT:    kshiftrb $1, %k0, %k1
-; AVX512DQ-NEXT:    kmovw %k0, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    leal (%rax,%rax,2), %ecx
-; AVX512DQ-NEXT:    leal (%rcx,%rax,4), %ecx
-; AVX512DQ-NEXT:    leal (%rcx,%rax,8), %ecx
-; AVX512DQ-NEXT:    shll $4, %eax
-; AVX512DQ-NEXT:    orl %ecx, %eax
-; AVX512DQ-NEXT:    kmovw %k1, %ecx
-; AVX512DQ-NEXT:    movl %ecx, %edx
-; AVX512DQ-NEXT:    andl $1, %edx
-; AVX512DQ-NEXT:    movl %edx, %edi
-; AVX512DQ-NEXT:    shll $5, %edi
-; AVX512DQ-NEXT:    orl %eax, %edi
-; AVX512DQ-NEXT:    movl %edx, %eax
-; AVX512DQ-NEXT:    shll $6, %eax
-; AVX512DQ-NEXT:    movl %edx, %r8d
-; AVX512DQ-NEXT:    shll $7, %r8d
-; AVX512DQ-NEXT:    orl %eax, %r8d
-; AVX512DQ-NEXT:    shll $8, %edx
-; AVX512DQ-NEXT:    orl %r8d, %edx
-; AVX512DQ-NEXT:    shll $9, %ecx
-; AVX512DQ-NEXT:    orl %edx, %ecx
-; AVX512DQ-NEXT:    orl %edi, %ecx
-; AVX512DQ-NEXT:    andl $1023, %ecx # imm = 0x3FF
-; AVX512DQ-NEXT:    movw %cx, (%rsi)
+; AVX512DQ-NEXT:    kmovw (%rdi), %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,0,1,1,1,1,1,u,u,u,u,u,u>
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512DQ-NEXT:    movw $1023, %ax # imm = 0x3FF
+; AVX512DQ-NEXT:    kmovw %eax, %k1
+; AVX512DQ-NEXT:    vpcmpgtd %zmm0, %zmm1, %k1 {%k1}
+; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
+; AVX512DQ-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
+; AVX512DQ-NEXT:    vmovq %xmm1, 32(%rdx)
+; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rdx)
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512BW-LABEL: mask_replication_factor5_vf2:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    kmovw (%rdi), %k0
-; AVX512BW-NEXT:    kshiftrw $1, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k0, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    leal (%rax,%rax,2), %ecx
-; AVX512BW-NEXT:    leal (%rcx,%rax,4), %ecx
-; AVX512BW-NEXT:    leal (%rcx,%rax,8), %ecx
-; AVX512BW-NEXT:    shll $4, %eax
-; AVX512BW-NEXT:    orl %ecx, %eax
-; AVX512BW-NEXT:    kmovd %k1, %ecx
-; AVX512BW-NEXT:    movl %ecx, %edx
-; AVX512BW-NEXT:    andl $1, %edx
-; AVX512BW-NEXT:    movl %edx, %edi
-; AVX512BW-NEXT:    shll $5, %edi
-; AVX512BW-NEXT:    orl %eax, %edi
-; AVX512BW-NEXT:    movl %edx, %eax
-; AVX512BW-NEXT:    shll $6, %eax
-; AVX512BW-NEXT:    movl %edx, %r8d
-; AVX512BW-NEXT:    shll $7, %r8d
-; AVX512BW-NEXT:    orl %eax, %r8d
-; AVX512BW-NEXT:    shll $8, %edx
-; AVX512BW-NEXT:    orl %r8d, %edx
-; AVX512BW-NEXT:    shll $9, %ecx
-; AVX512BW-NEXT:    orl %edx, %ecx
-; AVX512BW-NEXT:    orl %edi, %ecx
-; AVX512BW-NEXT:    andl $1023, %ecx # imm = 0x3FF
-; AVX512BW-NEXT:    movw %cx, (%rsi)
+; AVX512BW-NEXT:    kmovq (%rdi), %k1
+; AVX512BW-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,0,1,1,1,1,1,u,u,u,u,u,u>
+; AVX512BW-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vpslld $31, %zmm0, %zmm0
+; AVX512BW-NEXT:    movw $1023, %ax # imm = 0x3FF
+; AVX512BW-NEXT:    kmovd %eax, %k1
+; AVX512BW-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
+; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
+; AVX512BW-NEXT:    vmovq %xmm1, 32(%rdx)
+; AVX512BW-NEXT:    vmovdqa %ymm0, (%rdx)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
-  %src.vec = load <2 x i1>, ptr %in.vec, align 64
-  %tgt.mask = shufflevector <2 x i1> %src.vec, <2 x i1> poison, <10 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1>
-  store <10 x i1> %tgt.mask, ptr %out.vec, align 64
+  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
+  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <2 x i32> <i32 0, i32 1>
+  %tgt.mask = shufflevector <2 x i1> %src.mask, <2 x i1> poison, <10 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %data = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr %in.vec, i32 64, <10 x i1> %tgt.mask, <10 x i32> poison)
+  %data.padded = shufflevector <10 x i32> %data, <10 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  store <10 x i32> %data, ptr %out.vec, align 64
   ret void
 }
 
-define void @mask_replication_factor5_vf4(ptr %in.vec, ptr %out.vec) nounwind {
+define void @mask_replication_factor5_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512F-ONLY-LABEL: mask_replication_factor5_vf4:
 ; AVX512F-ONLY:       # %bb.0:
-; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k2
-; AVX512F-ONLY-NEXT:    kshiftrw $3, %k2, %k0
-; AVX512F-ONLY-NEXT:    kshiftrw $2, %k2, %k1
-; AVX512F-ONLY-NEXT:    kshiftrw $1, %k2, %k3
-; AVX512F-ONLY-NEXT:    kmovw %k2, %ecx
-; AVX512F-ONLY-NEXT:    andl $1, %ecx
-; AVX512F-ONLY-NEXT:    leal (%rcx,%rcx,2), %eax
-; AVX512F-ONLY-NEXT:    leal (%rax,%rcx,4), %eax
-; AVX512F-ONLY-NEXT:    leal (%rax,%rcx,8), %eax
-; AVX512F-ONLY-NEXT:    shll $4, %ecx
-; AVX512F-ONLY-NEXT:    orl %eax, %ecx
-; AVX512F-ONLY-NEXT:    kmovw %k3, %edx
-; AVX512F-ONLY-NEXT:    andl $1, %edx
-; AVX512F-ONLY-NEXT:    movl %edx, %eax
-; AVX512F-ONLY-NEXT:    shll $5, %eax
-; AVX512F-ONLY-NEXT:    orl %ecx, %eax
-; AVX512F-ONLY-NEXT:    movl %edx, %ecx
-; AVX512F-ONLY-NEXT:    shll $6, %ecx
-; AVX512F-ONLY-NEXT:    movl %edx, %edi
-; AVX512F-ONLY-NEXT:    shll $7, %edi
-; AVX512F-ONLY-NEXT:    orl %ecx, %edi
-; AVX512F-ONLY-NEXT:    movl %edx, %ecx
-; AVX512F-ONLY-NEXT:    shll $8, %ecx
-; AVX512F-ONLY-NEXT:    orl %edi, %ecx
-; AVX512F-ONLY-NEXT:    shll $9, %edx
-; AVX512F-ONLY-NEXT:    orl %ecx, %edx
-; AVX512F-ONLY-NEXT:    kmovw %k1, %edi
-; AVX512F-ONLY-NEXT:    andl $1, %edi
-; AVX512F-ONLY-NEXT:    movl %edi, %ecx
-; AVX512F-ONLY-NEXT:    shll $10, %ecx
-; AVX512F-ONLY-NEXT:    orl %edx, %ecx
-; AVX512F-ONLY-NEXT:    movl %edi, %edx
-; AVX512F-ONLY-NEXT:    shll $11, %edx
-; AVX512F-ONLY-NEXT:    orl %ecx, %edx
-; AVX512F-ONLY-NEXT:    movl %edi, %ecx
-; AVX512F-ONLY-NEXT:    shll $12, %ecx
-; AVX512F-ONLY-NEXT:    orl %edx, %ecx
-; AVX512F-ONLY-NEXT:    movl %edi, %edx
-; AVX512F-ONLY-NEXT:    shll $13, %edx
-; AVX512F-ONLY-NEXT:    orl %ecx, %edx
-; AVX512F-ONLY-NEXT:    shll $14, %edi
-; AVX512F-ONLY-NEXT:    orl %edx, %edi
-; AVX512F-ONLY-NEXT:    kmovw %k0, %ecx
-; AVX512F-ONLY-NEXT:    movl %ecx, %edx
-; AVX512F-ONLY-NEXT:    andl $1, %edx
-; AVX512F-ONLY-NEXT:    movl %edx, %r8d
-; AVX512F-ONLY-NEXT:    shll $15, %r8d
-; AVX512F-ONLY-NEXT:    orl %edi, %r8d
-; AVX512F-ONLY-NEXT:    orl %eax, %r8d
-; AVX512F-ONLY-NEXT:    movw %r8w, (%rsi)
-; AVX512F-ONLY-NEXT:    movl %edx, %eax
-; AVX512F-ONLY-NEXT:    shll $16, %eax
-; AVX512F-ONLY-NEXT:    movl %edx, %edi
-; AVX512F-ONLY-NEXT:    shll $17, %edi
-; AVX512F-ONLY-NEXT:    orl %eax, %edi
-; AVX512F-ONLY-NEXT:    shll $18, %edx
-; AVX512F-ONLY-NEXT:    orl %edi, %edx
-; AVX512F-ONLY-NEXT:    shll $19, %ecx
-; AVX512F-ONLY-NEXT:    orl %edx, %ecx
-; AVX512F-ONLY-NEXT:    orl %r8d, %ecx
-; AVX512F-ONLY-NEXT:    shrl $16, %ecx
-; AVX512F-ONLY-NEXT:    andl $15, %ecx
-; AVX512F-ONLY-NEXT:    movb %cl, 2(%rsi)
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; AVX512F-ONLY-NEXT:    vpslld $31, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    movw $15, %ax
+; AVX512F-ONLY-NEXT:    kmovw %eax, %k1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1 {%k1}
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k2
+; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k2} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm1, (%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa %xmm0, 64(%rdx)
+; AVX512F-ONLY-NEXT:    vzeroupper
 ; AVX512F-ONLY-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: mask_replication_factor5_vf4:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    kmovb (%rdi), %k2
-; AVX512DQ-NEXT:    kshiftrb $3, %k2, %k0
-; AVX512DQ-NEXT:    kshiftrb $2, %k2, %k1
-; AVX512DQ-NEXT:    kshiftrb $1, %k2, %k3
-; AVX512DQ-NEXT:    kmovw %k2, %ecx
-; AVX512DQ-NEXT:    andl $1, %ecx
-; AVX512DQ-NEXT:    leal (%rcx,%rcx,2), %eax
-; AVX512DQ-NEXT:    leal (%rax,%rcx,4), %eax
-; AVX512DQ-NEXT:    leal (%rax,%rcx,8), %eax
-; AVX512DQ-NEXT:    shll $4, %ecx
-; AVX512DQ-NEXT:    orl %eax, %ecx
-; AVX512DQ-NEXT:    kmovw %k3, %edx
-; AVX512DQ-NEXT:    andl $1, %edx
-; AVX512DQ-NEXT:    movl %edx, %eax
-; AVX512DQ-NEXT:    shll $5, %eax
-; AVX512DQ-NEXT:    orl %ecx, %eax
-; AVX512DQ-NEXT:    movl %edx, %ecx
-; AVX512DQ-NEXT:    shll $6, %ecx
-; AVX512DQ-NEXT:    movl %edx, %edi
-; AVX512DQ-NEXT:    shll $7, %edi
-; AVX512DQ-NEXT:    orl %ecx, %edi
-; AVX512DQ-NEXT:    movl %edx, %ecx
-; AVX512DQ-NEXT:    shll $8, %ecx
-; AVX512DQ-NEXT:    orl %edi, %ecx
-; AVX512DQ-NEXT:    shll $9, %edx
-; AVX512DQ-NEXT:    orl %ecx, %edx
-; AVX512DQ-NEXT:    kmovw %k1, %edi
-; AVX512DQ-NEXT:    andl $1, %edi
-; AVX512DQ-NEXT:    movl %edi, %ecx
-; AVX512DQ-NEXT:    shll $10, %ecx
-; AVX512DQ-NEXT:    orl %edx, %ecx
-; AVX512DQ-NEXT:    movl %edi, %edx
-; AVX512DQ-NEXT:    shll $11, %edx
-; AVX512DQ-NEXT:    orl %ecx, %edx
-; AVX512DQ-NEXT:    movl %edi, %ecx
-; AVX512DQ-NEXT:    shll $12, %ecx
-; AVX512DQ-NEXT:    orl %edx, %ecx
-; AVX512DQ-NEXT:    movl %edi, %edx
-; AVX512DQ-NEXT:    shll $13, %edx
-; AVX512DQ-NEXT:    orl %ecx, %edx
-; AVX512DQ-NEXT:    shll $14, %edi
-; AVX512DQ-NEXT:    orl %edx, %edi
-; AVX512DQ-NEXT:    kmovw %k0, %ecx
-; AVX512DQ-NEXT:    movl %ecx, %edx
-; AVX512DQ-NEXT:    andl $1, %edx
-; AVX512DQ-NEXT:    movl %edx, %r8d
-; AVX512DQ-NEXT:    shll $15, %r8d
-; AVX512DQ-NEXT:    orl %edi, %r8d
-; AVX512DQ-NEXT:    orl %eax, %r8d
-; AVX512DQ-NEXT:    movw %r8w, (%rsi)
-; AVX512DQ-NEXT:    movl %edx, %eax
-; AVX512DQ-NEXT:    shll $16, %eax
-; AVX512DQ-NEXT:    movl %edx, %edi
-; AVX512DQ-NEXT:    shll $17, %edi
-; AVX512DQ-NEXT:    orl %eax, %edi
-; AVX512DQ-NEXT:    shll $18, %edx
-; AVX512DQ-NEXT:    orl %edi, %edx
-; AVX512DQ-NEXT:    shll $19, %ecx
-; AVX512DQ-NEXT:    orl %edx, %ecx
-; AVX512DQ-NEXT:    orl %r8d, %ecx
-; AVX512DQ-NEXT:    shrl $16, %ecx
-; AVX512DQ-NEXT:    andl $15, %ecx
-; AVX512DQ-NEXT:    movb %cl, 2(%rsi)
+; AVX512DQ-NEXT:    kmovw (%rdi), %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; AVX512DQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512DQ-NEXT:    movw $15, %ax
+; AVX512DQ-NEXT:    kmovw %eax, %k1
+; AVX512DQ-NEXT:    vpcmpgtd %zmm1, %zmm2, %k1 {%k1}
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k2
+; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
+; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k2} {z}
+; AVX512DQ-NEXT:    vmovdqa64 %zmm1, (%rdx)
+; AVX512DQ-NEXT:    vmovdqa %xmm0, 64(%rdx)
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512BW-LABEL: mask_replication_factor5_vf4:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    kmovw (%rdi), %k2
-; AVX512BW-NEXT:    kshiftrw $3, %k2, %k0
-; AVX512BW-NEXT:    kshiftrw $2, %k2, %k1
-; AVX512BW-NEXT:    kshiftrw $1, %k2, %k3
-; AVX512BW-NEXT:    kmovd %k2, %ecx
-; AVX512BW-NEXT:    andl $1, %ecx
-; AVX512BW-NEXT:    leal (%rcx,%rcx,2), %eax
-; AVX512BW-NEXT:    leal (%rax,%rcx,4), %eax
-; AVX512BW-NEXT:    leal (%rax,%rcx,8), %eax
-; AVX512BW-NEXT:    shll $4, %ecx
-; AVX512BW-NEXT:    orl %eax, %ecx
-; AVX512BW-NEXT:    kmovd %k3, %edx
-; AVX512BW-NEXT:    andl $1, %edx
-; AVX512BW-NEXT:    movl %edx, %eax
-; AVX512BW-NEXT:    shll $5, %eax
-; AVX512BW-NEXT:    orl %ecx, %eax
-; AVX512BW-NEXT:    movl %edx, %ecx
-; AVX512BW-NEXT:    shll $6, %ecx
-; AVX512BW-NEXT:    movl %edx, %edi
-; AVX512BW-NEXT:    shll $7, %edi
-; AVX512BW-NEXT:    orl %ecx, %edi
-; AVX512BW-NEXT:    movl %edx, %ecx
-; AVX512BW-NEXT:    shll $8, %ecx
-; AVX512BW-NEXT:    orl %edi, %ecx
-; AVX512BW-NEXT:    shll $9, %edx
-; AVX512BW-NEXT:    orl %ecx, %edx
-; AVX512BW-NEXT:    kmovd %k1, %edi
-; AVX512BW-NEXT:    andl $1, %edi
-; AVX512BW-NEXT:    movl %edi, %ecx
-; AVX512BW-NEXT:    shll $10, %ecx
-; AVX512BW-NEXT:    orl %edx, %ecx
-; AVX512BW-NEXT:    movl %edi, %edx
-; AVX512BW-NEXT:    shll $11, %edx
-; AVX512BW-NEXT:    orl %ecx, %edx
-; AVX512BW-NEXT:    movl %edi, %ecx
-; AVX512BW-NEXT:    shll $12, %ecx
-; AVX512BW-NEXT:    orl %edx, %ecx
-; AVX512BW-NEXT:    movl %edi, %edx
-; AVX512BW-NEXT:    shll $13, %edx
-; AVX512BW-NEXT:    orl %ecx, %edx
-; AVX512BW-NEXT:    shll $14, %edi
-; AVX512BW-NEXT:    orl %edx, %edi
-; AVX512BW-NEXT:    kmovd %k0, %ecx
-; AVX512BW-NEXT:    movl %ecx, %edx
-; AVX512BW-NEXT:    andl $1, %edx
-; AVX512BW-NEXT:    movl %edx, %r8d
-; AVX512BW-NEXT:    shll $15, %r8d
-; AVX512BW-NEXT:    orl %edi, %r8d
-; AVX512BW-NEXT:    orl %eax, %r8d
-; AVX512BW-NEXT:    movw %r8w, (%rsi)
-; AVX512BW-NEXT:    movl %edx, %eax
-; AVX512BW-NEXT:    shll $16, %eax
-; AVX512BW-NEXT:    movl %edx, %edi
-; AVX512BW-NEXT:    shll $17, %edi
-; AVX512BW-NEXT:    orl %eax, %edi
-; AVX512BW-NEXT:    shll $18, %edx
-; AVX512BW-NEXT:    orl %edi, %edx
-; AVX512BW-NEXT:    shll $19, %ecx
-; AVX512BW-NEXT:    orl %edx, %ecx
-; AVX512BW-NEXT:    orl %r8d, %ecx
-; AVX512BW-NEXT:    shrl $16, %ecx
-; AVX512BW-NEXT:    andl $15, %ecx
-; AVX512BW-NEXT:    movb %cl, 2(%rsi)
+; AVX512BW-NEXT:    kmovq (%rdi), %k0
+; AVX512BW-NEXT:    vpmovm2w %k0, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3,3,3,3,3,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT:    vpermw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT:    movl $1048575, %eax # imm = 0xFFFFF
+; AVX512BW-NEXT:    kmovd %eax, %k1
+; AVX512BW-NEXT:    vpcmpgtw %zmm0, %zmm1, %k1 {%k1}
+; AVX512BW-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
+; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k1} {z}
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, (%rdx)
+; AVX512BW-NEXT:    vmovdqa %xmm0, 64(%rdx)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
-  %src.vec = load <4 x i1>, ptr %in.vec, align 64
-  %tgt.mask = shufflevector <4 x i1> %src.vec, <4 x i1> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
-  store <20 x i1> %tgt.mask, ptr %out.vec, align 64
+  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
+  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %tgt.mask = shufflevector <4 x i1> %src.mask, <4 x i1> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
+  %data = call <20 x i32> @llvm.masked.load.v20i32.p0(ptr %in.vec, i32 64, <20 x i1> %tgt.mask, <20 x i32> poison)
+  %data.padded = shufflevector <20 x i32> %data, <20 x i32> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  store <20 x i32> %data, ptr %out.vec, align 64
   ret void
 }
 
-define void @mask_replication_factor5_vf8(ptr %in.vec, ptr %out.vec) nounwind {
+define void @mask_replication_factor5_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512F-ONLY-LABEL: mask_replication_factor5_vf8:
 ; AVX512F-ONLY:       # %bb.0:
-; AVX512F-ONLY-NEXT:    pushq %r14
-; AVX512F-ONLY-NEXT:    pushq %rbx
-; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k0
-; AVX512F-ONLY-NEXT:    kshiftrw $7, %k0, %k1
-; AVX512F-ONLY-NEXT:    kmovw %k1, %eax
-; AVX512F-ONLY-NEXT:    kshiftrw $6, %k0, %k1
-; AVX512F-ONLY-NEXT:    kmovw %k1, %ecx
-; AVX512F-ONLY-NEXT:    kshiftrw $5, %k0, %k1
-; AVX512F-ONLY-NEXT:    kmovw %k1, %edi
-; AVX512F-ONLY-NEXT:    kshiftrw $4, %k0, %k1
-; AVX512F-ONLY-NEXT:    kmovw %k1, %r8d
-; AVX512F-ONLY-NEXT:    kshiftrw $3, %k0, %k1
-; AVX512F-ONLY-NEXT:    kmovw %k1, %r9d
-; AVX512F-ONLY-NEXT:    kshiftrw $2, %k0, %k1
-; AVX512F-ONLY-NEXT:    kmovw %k1, %r10d
-; AVX512F-ONLY-NEXT:    kshiftrw $1, %k0, %k1
-; AVX512F-ONLY-NEXT:    kmovw %k1, %edx
-; AVX512F-ONLY-NEXT:    kmovw %k0, %r11d
-; AVX512F-ONLY-NEXT:    movzbl %r11b, %ebx
-; AVX512F-ONLY-NEXT:    andl $1, %ebx
-; AVX512F-ONLY-NEXT:    leaq (%rbx,%rbx,2), %r11
-; AVX512F-ONLY-NEXT:    leaq (%r11,%rbx,4), %r11
-; AVX512F-ONLY-NEXT:    leaq (%r11,%rbx,8), %r11
-; AVX512F-ONLY-NEXT:    shlq $4, %rbx
-; AVX512F-ONLY-NEXT:    orq %r11, %rbx
-; AVX512F-ONLY-NEXT:    movzbl %dl, %r11d
-; AVX512F-ONLY-NEXT:    andl $1, %r11d
-; AVX512F-ONLY-NEXT:    movq %r11, %rdx
-; AVX512F-ONLY-NEXT:    shlq $5, %rdx
-; AVX512F-ONLY-NEXT:    orq %rbx, %rdx
-; AVX512F-ONLY-NEXT:    movq %r11, %rbx
-; AVX512F-ONLY-NEXT:    shlq $6, %rbx
-; AVX512F-ONLY-NEXT:    movq %r11, %r14
-; AVX512F-ONLY-NEXT:    shlq $7, %r14
-; AVX512F-ONLY-NEXT:    orq %rbx, %r14
-; AVX512F-ONLY-NEXT:    movq %r11, %rbx
-; AVX512F-ONLY-NEXT:    shlq $8, %rbx
-; AVX512F-ONLY-NEXT:    orq %r14, %rbx
-; AVX512F-ONLY-NEXT:    shlq $9, %r11
-; AVX512F-ONLY-NEXT:    orq %rbx, %r11
-; AVX512F-ONLY-NEXT:    movzbl %r10b, %r10d
-; AVX512F-ONLY-NEXT:    andl $1, %r10d
-; AVX512F-ONLY-NEXT:    movq %r10, %rbx
-; AVX512F-ONLY-NEXT:    shlq $10, %rbx
-; AVX512F-ONLY-NEXT:    orq %r11, %rbx
-; AVX512F-ONLY-NEXT:    movq %r10, %r11
-; AVX512F-ONLY-NEXT:    shlq $11, %r11
-; AVX512F-ONLY-NEXT:    orq %rbx, %r11
-; AVX512F-ONLY-NEXT:    movq %r10, %rbx
-; AVX512F-ONLY-NEXT:    shlq $12, %rbx
-; AVX512F-ONLY-NEXT:    orq %r11, %rbx
-; AVX512F-ONLY-NEXT:    movq %r10, %r11
-; AVX512F-ONLY-NEXT:    shlq $13, %r11
-; AVX512F-ONLY-NEXT:    orq %rbx, %r11
-; AVX512F-ONLY-NEXT:    shlq $14, %r10
-; AVX512F-ONLY-NEXT:    orq %r11, %r10
-; AVX512F-ONLY-NEXT:    movzbl %r9b, %r9d
-; AVX512F-ONLY-NEXT:    andl $1, %r9d
-; AVX512F-ONLY-NEXT:    movq %r9, %r11
-; AVX512F-ONLY-NEXT:    shlq $15, %r11
-; AVX512F-ONLY-NEXT:    orq %r10, %r11
-; AVX512F-ONLY-NEXT:    movq %r9, %r10
-; AVX512F-ONLY-NEXT:    shlq $16, %r10
-; AVX512F-ONLY-NEXT:    orq %r11, %r10
-; AVX512F-ONLY-NEXT:    movq %r9, %r11
-; AVX512F-ONLY-NEXT:    shlq $17, %r11
-; AVX512F-ONLY-NEXT:    orq %r10, %r11
-; AVX512F-ONLY-NEXT:    movq %r9, %r10
-; AVX512F-ONLY-NEXT:    shlq $18, %r10
-; AVX512F-ONLY-NEXT:    orq %r11, %r10
-; AVX512F-ONLY-NEXT:    shlq $19, %r9
-; AVX512F-ONLY-NEXT:    orq %r10, %r9
-; AVX512F-ONLY-NEXT:    movzbl %r8b, %r8d
-; AVX512F-ONLY-NEXT:    andl $1, %r8d
-; AVX512F-ONLY-NEXT:    movq %r8, %r10
-; AVX512F-ONLY-NEXT:    shlq $20, %r10
-; AVX512F-ONLY-NEXT:    orq %r9, %r10
-; AVX512F-ONLY-NEXT:    movq %r8, %r9
-; AVX512F-ONLY-NEXT:    shlq $21, %r9
-; AVX512F-ONLY-NEXT:    orq %r10, %r9
-; AVX512F-ONLY-NEXT:    movq %r8, %r10
-; AVX512F-ONLY-NEXT:    shlq $22, %r10
-; AVX512F-ONLY-NEXT:    orq %r9, %r10
-; AVX512F-ONLY-NEXT:    movq %r8, %r9
-; AVX512F-ONLY-NEXT:    shlq $23, %r9
-; AVX512F-ONLY-NEXT:    orq %r10, %r9
-; AVX512F-ONLY-NEXT:    shlq $24, %r8
-; AVX512F-ONLY-NEXT:    orq %r9, %r8
-; AVX512F-ONLY-NEXT:    movzbl %dil, %r9d
-; AVX512F-ONLY-NEXT:    andl $1, %r9d
-; AVX512F-ONLY-NEXT:    movq %r9, %rdi
-; AVX512F-ONLY-NEXT:    shlq $25, %rdi
-; AVX512F-ONLY-NEXT:    orq %r8, %rdi
-; AVX512F-ONLY-NEXT:    movq %r9, %r8
-; AVX512F-ONLY-NEXT:    shlq $26, %r8
-; AVX512F-ONLY-NEXT:    orq %rdi, %r8
-; AVX512F-ONLY-NEXT:    movq %r9, %rdi
-; AVX512F-ONLY-NEXT:    shlq $27, %rdi
-; AVX512F-ONLY-NEXT:    orq %r8, %rdi
-; AVX512F-ONLY-NEXT:    movq %r9, %r8
-; AVX512F-ONLY-NEXT:    shlq $28, %r8
-; AVX512F-ONLY-NEXT:    orq %rdi, %r8
-; AVX512F-ONLY-NEXT:    shlq $29, %r9
-; AVX512F-ONLY-NEXT:    orq %r8, %r9
-; AVX512F-ONLY-NEXT:    movzbl %cl, %edi
-; AVX512F-ONLY-NEXT:    andl $1, %edi
-; AVX512F-ONLY-NEXT:    movq %rdi, %r8
-; AVX512F-ONLY-NEXT:    shlq $30, %r8
-; AVX512F-ONLY-NEXT:    orq %r9, %r8
-; AVX512F-ONLY-NEXT:    shlq $31, %rdi
-; AVX512F-ONLY-NEXT:    orq %r8, %rdi
-; AVX512F-ONLY-NEXT:    orq %rdx, %rdi
-; AVX512F-ONLY-NEXT:    movl %edi, (%rsi)
-; AVX512F-ONLY-NEXT:    andl $1, %ecx
-; AVX512F-ONLY-NEXT:    movq %rcx, %rdx
-; AVX512F-ONLY-NEXT:    shlq $32, %rdx
-; AVX512F-ONLY-NEXT:    movq %rcx, %r8
-; AVX512F-ONLY-NEXT:    shlq $33, %r8
-; AVX512F-ONLY-NEXT:    orq %rdx, %r8
-; AVX512F-ONLY-NEXT:    shlq $34, %rcx
-; AVX512F-ONLY-NEXT:    orq %r8, %rcx
-; AVX512F-ONLY-NEXT:    movzbl %al, %edx
-; AVX512F-ONLY-NEXT:    # kill: def $eax killed $eax def $rax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %r8
-; AVX512F-ONLY-NEXT:    shlq $35, %r8
-; AVX512F-ONLY-NEXT:    orq %rcx, %r8
-; AVX512F-ONLY-NEXT:    movq %rax, %rcx
-; AVX512F-ONLY-NEXT:    shlq $36, %rcx
-; AVX512F-ONLY-NEXT:    orq %r8, %rcx
-; AVX512F-ONLY-NEXT:    movq %rax, %r8
-; AVX512F-ONLY-NEXT:    shlq $37, %r8
-; AVX512F-ONLY-NEXT:    orq %rcx, %r8
-; AVX512F-ONLY-NEXT:    shlq $38, %rax
-; AVX512F-ONLY-NEXT:    orq %r8, %rax
-; AVX512F-ONLY-NEXT:    shlq $39, %rdx
-; AVX512F-ONLY-NEXT:    orq %rax, %rdx
-; AVX512F-ONLY-NEXT:    orq %rdi, %rdx
-; AVX512F-ONLY-NEXT:    shrq $32, %rdx
-; AVX512F-ONLY-NEXT:    movb %dl, 4(%rsi)
-; AVX512F-ONLY-NEXT:    popq %rbx
-; AVX512F-ONLY-NEXT:    popq %r14
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k2
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
+; AVX512F-ONLY-NEXT:    movw $1, %ax
+; AVX512F-ONLY-NEXT:    kmovw %eax, %k2
+; AVX512F-ONLY-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k2}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k2
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k3
+; AVX512F-ONLY-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512F-ONLY-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,6,6,7,7,7,7,7]
+; AVX512F-ONLY-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX512F-ONLY-NEXT:    vptestmd %ymm0, %ymm0, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k2} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm1 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm2 {%k3} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm2, 64(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa %ymm1, 128(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, (%rdx)
+; AVX512F-ONLY-NEXT:    vzeroupper
 ; AVX512F-ONLY-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: mask_replication_factor5_vf8:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    pushq %r14
-; AVX512DQ-NEXT:    pushq %rbx
 ; AVX512DQ-NEXT:    kmovb (%rdi), %k0
-; AVX512DQ-NEXT:    kshiftrb $7, %k0, %k1
-; AVX512DQ-NEXT:    kmovw %k1, %eax
-; AVX512DQ-NEXT:    kshiftrb $6, %k0, %k1
-; AVX512DQ-NEXT:    kmovw %k1, %ecx
-; AVX512DQ-NEXT:    kshiftrb $5, %k0, %k1
-; AVX512DQ-NEXT:    kmovw %k1, %edi
-; AVX512DQ-NEXT:    kshiftrb $4, %k0, %k1
-; AVX512DQ-NEXT:    kmovw %k1, %r8d
-; AVX512DQ-NEXT:    kshiftrb $3, %k0, %k1
-; AVX512DQ-NEXT:    kmovw %k1, %r9d
-; AVX512DQ-NEXT:    kshiftrb $2, %k0, %k1
-; AVX512DQ-NEXT:    kmovw %k1, %r10d
-; AVX512DQ-NEXT:    kshiftrb $1, %k0, %k1
-; AVX512DQ-NEXT:    kmovw %k1, %edx
-; AVX512DQ-NEXT:    kmovw %k0, %r11d
-; AVX512DQ-NEXT:    movzbl %r11b, %ebx
-; AVX512DQ-NEXT:    andl $1, %ebx
-; AVX512DQ-NEXT:    leaq (%rbx,%rbx,2), %r11
-; AVX512DQ-NEXT:    leaq (%r11,%rbx,4), %r11
-; AVX512DQ-NEXT:    leaq (%r11,%rbx,8), %r11
-; AVX512DQ-NEXT:    shlq $4, %rbx
-; AVX512DQ-NEXT:    orq %r11, %rbx
-; AVX512DQ-NEXT:    movzbl %dl, %r11d
-; AVX512DQ-NEXT:    andl $1, %r11d
-; AVX512DQ-NEXT:    movq %r11, %rdx
-; AVX512DQ-NEXT:    shlq $5, %rdx
-; AVX512DQ-NEXT:    orq %rbx, %rdx
-; AVX512DQ-NEXT:    movq %r11, %rbx
-; AVX512DQ-NEXT:    shlq $6, %rbx
-; AVX512DQ-NEXT:    movq %r11, %r14
-; AVX512DQ-NEXT:    shlq $7, %r14
-; AVX512DQ-NEXT:    orq %rbx, %r14
-; AVX512DQ-NEXT:    movq %r11, %rbx
-; AVX512DQ-NEXT:    shlq $8, %rbx
-; AVX512DQ-NEXT:    orq %r14, %rbx
-; AVX512DQ-NEXT:    shlq $9, %r11
-; AVX512DQ-NEXT:    orq %rbx, %r11
-; AVX512DQ-NEXT:    movzbl %r10b, %r10d
-; AVX512DQ-NEXT:    andl $1, %r10d
-; AVX512DQ-NEXT:    movq %r10, %rbx
-; AVX512DQ-NEXT:    shlq $10, %rbx
-; AVX512DQ-NEXT:    orq %r11, %rbx
-; AVX512DQ-NEXT:    movq %r10, %r11
-; AVX512DQ-NEXT:    shlq $11, %r11
-; AVX512DQ-NEXT:    orq %rbx, %r11
-; AVX512DQ-NEXT:    movq %r10, %rbx
-; AVX512DQ-NEXT:    shlq $12, %rbx
-; AVX512DQ-NEXT:    orq %r11, %rbx
-; AVX512DQ-NEXT:    movq %r10, %r11
-; AVX512DQ-NEXT:    shlq $13, %r11
-; AVX512DQ-NEXT:    orq %rbx, %r11
-; AVX512DQ-NEXT:    shlq $14, %r10
-; AVX512DQ-NEXT:    orq %r11, %r10
-; AVX512DQ-NEXT:    movzbl %r9b, %r9d
-; AVX512DQ-NEXT:    andl $1, %r9d
-; AVX512DQ-NEXT:    movq %r9, %r11
-; AVX512DQ-NEXT:    shlq $15, %r11
-; AVX512DQ-NEXT:    orq %r10, %r11
-; AVX512DQ-NEXT:    movq %r9, %r10
-; AVX512DQ-NEXT:    shlq $16, %r10
-; AVX512DQ-NEXT:    orq %r11, %r10
-; AVX512DQ-NEXT:    movq %r9, %r11
-; AVX512DQ-NEXT:    shlq $17, %r11
-; AVX512DQ-NEXT:    orq %r10, %r11
-; AVX512DQ-NEXT:    movq %r9, %r10
-; AVX512DQ-NEXT:    shlq $18, %r10
-; AVX512DQ-NEXT:    orq %r11, %r10
-; AVX512DQ-NEXT:    shlq $19, %r9
-; AVX512DQ-NEXT:    orq %r10, %r9
-; AVX512DQ-NEXT:    movzbl %r8b, %r8d
-; AVX512DQ-NEXT:    andl $1, %r8d
-; AVX512DQ-NEXT:    movq %r8, %r10
-; AVX512DQ-NEXT:    shlq $20, %r10
-; AVX512DQ-NEXT:    orq %r9, %r10
-; AVX512DQ-NEXT:    movq %r8, %r9
-; AVX512DQ-NEXT:    shlq $21, %r9
-; AVX512DQ-NEXT:    orq %r10, %r9
-; AVX512DQ-NEXT:    movq %r8, %r10
-; AVX512DQ-NEXT:    shlq $22, %r10
-; AVX512DQ-NEXT:    orq %r9, %r10
-; AVX512DQ-NEXT:    movq %r8, %r9
-; AVX512DQ-NEXT:    shlq $23, %r9
-; AVX512DQ-NEXT:    orq %r10, %r9
-; AVX512DQ-NEXT:    shlq $24, %r8
-; AVX512DQ-NEXT:    orq %r9, %r8
-; AVX512DQ-NEXT:    movzbl %dil, %r9d
-; AVX512DQ-NEXT:    andl $1, %r9d
-; AVX512DQ-NEXT:    movq %r9, %rdi
-; AVX512DQ-NEXT:    shlq $25, %rdi
-; AVX512DQ-NEXT:    orq %r8, %rdi
-; AVX512DQ-NEXT:    movq %r9, %r8
-; AVX512DQ-NEXT:    shlq $26, %r8
-; AVX512DQ-NEXT:    orq %rdi, %r8
-; AVX512DQ-NEXT:    movq %r9, %rdi
-; AVX512DQ-NEXT:    shlq $27, %rdi
-; AVX512DQ-NEXT:    orq %r8, %rdi
-; AVX512DQ-NEXT:    movq %r9, %r8
-; AVX512DQ-NEXT:    shlq $28, %r8
-; AVX512DQ-NEXT:    orq %rdi, %r8
-; AVX512DQ-NEXT:    shlq $29, %r9
-; AVX512DQ-NEXT:    orq %r8, %r9
-; AVX512DQ-NEXT:    movzbl %cl, %edi
-; AVX512DQ-NEXT:    andl $1, %edi
-; AVX512DQ-NEXT:    movq %rdi, %r8
-; AVX512DQ-NEXT:    shlq $30, %r8
-; AVX512DQ-NEXT:    orq %r9, %r8
-; AVX512DQ-NEXT:    shlq $31, %rdi
-; AVX512DQ-NEXT:    orq %r8, %rdi
-; AVX512DQ-NEXT:    orq %rdx, %rdi
-; AVX512DQ-NEXT:    movl %edi, (%rsi)
-; AVX512DQ-NEXT:    andl $1, %ecx
-; AVX512DQ-NEXT:    movq %rcx, %rdx
-; AVX512DQ-NEXT:    shlq $32, %rdx
-; AVX512DQ-NEXT:    movq %rcx, %r8
-; AVX512DQ-NEXT:    shlq $33, %r8
-; AVX512DQ-NEXT:    orq %rdx, %r8
-; AVX512DQ-NEXT:    shlq $34, %rcx
-; AVX512DQ-NEXT:    orq %r8, %rcx
-; AVX512DQ-NEXT:    movzbl %al, %edx
-; AVX512DQ-NEXT:    # kill: def $eax killed $eax def $rax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %r8
-; AVX512DQ-NEXT:    shlq $35, %r8
-; AVX512DQ-NEXT:    orq %rcx, %r8
-; AVX512DQ-NEXT:    movq %rax, %rcx
-; AVX512DQ-NEXT:    shlq $36, %rcx
-; AVX512DQ-NEXT:    orq %r8, %rcx
-; AVX512DQ-NEXT:    movq %rax, %r8
-; AVX512DQ-NEXT:    shlq $37, %r8
-; AVX512DQ-NEXT:    orq %rcx, %r8
-; AVX512DQ-NEXT:    shlq $38, %rax
-; AVX512DQ-NEXT:    orq %r8, %rax
-; AVX512DQ-NEXT:    shlq $39, %rdx
-; AVX512DQ-NEXT:    orq %rax, %rdx
-; AVX512DQ-NEXT:    orq %rdi, %rdx
-; AVX512DQ-NEXT:    shrq $32, %rdx
-; AVX512DQ-NEXT:    movb %dl, 4(%rsi)
-; AVX512DQ-NEXT:    popq %rbx
-; AVX512DQ-NEXT:    popq %r14
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k1
+; AVX512DQ-NEXT:    vpmovm2d %k1, %zmm1
+; AVX512DQ-NEXT:    movw $1, %ax
+; AVX512DQ-NEXT:    kmovw %eax, %k1
+; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k1
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k2
+; AVX512DQ-NEXT:    vpmovm2d %k0, %ymm0
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,6,6,7,7,7,7,7]
+; AVX512DQ-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX512DQ-NEXT:    vpmovd2m %ymm0, %k3
+; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
+; AVX512DQ-NEXT:    vmovdqa32 128(%rsi), %zmm1 {%k3} {z}
+; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm2 {%k2} {z}
+; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 64(%rdx)
+; AVX512DQ-NEXT:    vmovdqa %ymm1, 128(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm0, (%rdx)
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
-; AVX512BW-LABEL: mask_replication_factor5_vf8:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    pushq %r14
-; AVX512BW-NEXT:    pushq %rbx
-; AVX512BW-NEXT:    kmovw (%rdi), %k0
-; AVX512BW-NEXT:    kshiftrw $7, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    kshiftrw $6, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %ecx
-; AVX512BW-NEXT:    kshiftrw $5, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %edi
-; AVX512BW-NEXT:    kshiftrw $4, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r8d
-; AVX512BW-NEXT:    kshiftrw $3, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r9d
-; AVX512BW-NEXT:    kshiftrw $2, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r10d
-; AVX512BW-NEXT:    kshiftrw $1, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %edx
-; AVX512BW-NEXT:    kmovd %k0, %r11d
-; AVX512BW-NEXT:    movzbl %r11b, %ebx
-; AVX512BW-NEXT:    andl $1, %ebx
-; AVX512BW-NEXT:    leaq (%rbx,%rbx,2), %r11
-; AVX512BW-NEXT:    leaq (%r11,%rbx,4), %r11
-; AVX512BW-NEXT:    leaq (%r11,%rbx,8), %r11
-; AVX512BW-NEXT:    shlq $4, %rbx
-; AVX512BW-NEXT:    orq %r11, %rbx
-; AVX512BW-NEXT:    movzbl %dl, %r11d
-; AVX512BW-NEXT:    andl $1, %r11d
-; AVX512BW-NEXT:    movq %r11, %rdx
-; AVX512BW-NEXT:    shlq $5, %rdx
-; AVX512BW-NEXT:    orq %rbx, %rdx
-; AVX512BW-NEXT:    movq %r11, %rbx
-; AVX512BW-NEXT:    shlq $6, %rbx
-; AVX512BW-NEXT:    movq %r11, %r14
-; AVX512BW-NEXT:    shlq $7, %r14
-; AVX512BW-NEXT:    orq %rbx, %r14
-; AVX512BW-NEXT:    movq %r11, %rbx
-; AVX512BW-NEXT:    shlq $8, %rbx
-; AVX512BW-NEXT:    orq %r14, %rbx
-; AVX512BW-NEXT:    shlq $9, %r11
-; AVX512BW-NEXT:    orq %rbx, %r11
-; AVX512BW-NEXT:    movzbl %r10b, %r10d
-; AVX512BW-NEXT:    andl $1, %r10d
-; AVX512BW-NEXT:    movq %r10, %rbx
-; AVX512BW-NEXT:    shlq $10, %rbx
-; AVX512BW-NEXT:    orq %r11, %rbx
-; AVX512BW-NEXT:    movq %r10, %r11
-; AVX512BW-NEXT:    shlq $11, %r11
-; AVX512BW-NEXT:    orq %rbx, %r11
-; AVX512BW-NEXT:    movq %r10, %rbx
-; AVX512BW-NEXT:    shlq $12, %rbx
-; AVX512BW-NEXT:    orq %r11, %rbx
-; AVX512BW-NEXT:    movq %r10, %r11
-; AVX512BW-NEXT:    shlq $13, %r11
-; AVX512BW-NEXT:    orq %rbx, %r11
-; AVX512BW-NEXT:    shlq $14, %r10
-; AVX512BW-NEXT:    orq %r11, %r10
-; AVX512BW-NEXT:    movzbl %r9b, %r9d
-; AVX512BW-NEXT:    andl $1, %r9d
-; AVX512BW-NEXT:    movq %r9, %r11
-; AVX512BW-NEXT:    shlq $15, %r11
-; AVX512BW-NEXT:    orq %r10, %r11
-; AVX512BW-NEXT:    movq %r9, %r10
-; AVX512BW-NEXT:    shlq $16, %r10
-; AVX512BW-NEXT:    orq %r11, %r10
-; AVX512BW-NEXT:    movq %r9, %r11
-; AVX512BW-NEXT:    shlq $17, %r11
-; AVX512BW-NEXT:    orq %r10, %r11
-; AVX512BW-NEXT:    movq %r9, %r10
-; AVX512BW-NEXT:    shlq $18, %r10
-; AVX512BW-NEXT:    orq %r11, %r10
-; AVX512BW-NEXT:    shlq $19, %r9
-; AVX512BW-NEXT:    orq %r10, %r9
-; AVX512BW-NEXT:    movzbl %r8b, %r8d
-; AVX512BW-NEXT:    andl $1, %r8d
-; AVX512BW-NEXT:    movq %r8, %r10
-; AVX512BW-NEXT:    shlq $20, %r10
-; AVX512BW-NEXT:    orq %r9, %r10
-; AVX512BW-NEXT:    movq %r8, %r9
-; AVX512BW-NEXT:    shlq $21, %r9
-; AVX512BW-NEXT:    orq %r10, %r9
-; AVX512BW-NEXT:    movq %r8, %r10
-; AVX512BW-NEXT:    shlq $22, %r10
-; AVX512BW-NEXT:    orq %r9, %r10
-; AVX512BW-NEXT:    movq %r8, %r9
-; AVX512BW-NEXT:    shlq $23, %r9
-; AVX512BW-NEXT:    orq %r10, %r9
-; AVX512BW-NEXT:    shlq $24, %r8
-; AVX512BW-NEXT:    orq %r9, %r8
-; AVX512BW-NEXT:    movzbl %dil, %r9d
-; AVX512BW-NEXT:    andl $1, %r9d
-; AVX512BW-NEXT:    movq %r9, %rdi
-; AVX512BW-NEXT:    shlq $25, %rdi
-; AVX512BW-NEXT:    orq %r8, %rdi
-; AVX512BW-NEXT:    movq %r9, %r8
-; AVX512BW-NEXT:    shlq $26, %r8
-; AVX512BW-NEXT:    orq %rdi, %r8
-; AVX512BW-NEXT:    movq %r9, %rdi
-; AVX512BW-NEXT:    shlq $27, %rdi
-; AVX512BW-NEXT:    orq %r8, %rdi
-; AVX512BW-NEXT:    movq %r9, %r8
-; AVX512BW-NEXT:    shlq $28, %r8
-; AVX512BW-NEXT:    orq %rdi, %r8
-; AVX512BW-NEXT:    shlq $29, %r9
-; AVX512BW-NEXT:    orq %r8, %r9
-; AVX512BW-NEXT:    movzbl %cl, %edi
-; AVX512BW-NEXT:    andl $1, %edi
-; AVX512BW-NEXT:    movq %rdi, %r8
-; AVX512BW-NEXT:    shlq $30, %r8
-; AVX512BW-NEXT:    orq %r9, %r8
-; AVX512BW-NEXT:    shlq $31, %rdi
-; AVX512BW-NEXT:    orq %r8, %rdi
-; AVX512BW-NEXT:    orq %rdx, %rdi
-; AVX512BW-NEXT:    movl %edi, (%rsi)
-; AVX512BW-NEXT:    andl $1, %ecx
-; AVX512BW-NEXT:    movq %rcx, %rdx
-; AVX512BW-NEXT:    shlq $32, %rdx
-; AVX512BW-NEXT:    movq %rcx, %r8
-; AVX512BW-NEXT:    shlq $33, %r8
-; AVX512BW-NEXT:    orq %rdx, %r8
-; AVX512BW-NEXT:    shlq $34, %rcx
-; AVX512BW-NEXT:    orq %r8, %rcx
-; AVX512BW-NEXT:    movzbl %al, %edx
-; AVX512BW-NEXT:    # kill: def $eax killed $eax def $rax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %r8
-; AVX512BW-NEXT:    shlq $35, %r8
-; AVX512BW-NEXT:    orq %rcx, %r8
-; AVX512BW-NEXT:    movq %rax, %rcx
-; AVX512BW-NEXT:    shlq $36, %rcx
-; AVX512BW-NEXT:    orq %r8, %rcx
-; AVX512BW-NEXT:    movq %rax, %r8
-; AVX512BW-NEXT:    shlq $37, %r8
-; AVX512BW-NEXT:    orq %rcx, %r8
-; AVX512BW-NEXT:    shlq $38, %rax
-; AVX512BW-NEXT:    orq %r8, %rax
-; AVX512BW-NEXT:    shlq $39, %rdx
-; AVX512BW-NEXT:    orq %rax, %rdx
-; AVX512BW-NEXT:    orq %rdi, %rdx
-; AVX512BW-NEXT:    shrq $32, %rdx
-; AVX512BW-NEXT:    movb %dl, 4(%rsi)
-; AVX512BW-NEXT:    popq %rbx
-; AVX512BW-NEXT:    popq %r14
-; AVX512BW-NEXT:    retq
-  %src.vec = load <8 x i1>, ptr %in.vec, align 64
-  %tgt.mask = shufflevector <8 x i1> %src.vec, <8 x i1> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
-  store <40 x i1> %tgt.mask, ptr %out.vec, align 64
+; AVX512BW-ONLY-LABEL: mask_replication_factor5_vf8:
+; AVX512BW-ONLY:       # %bb.0:
+; AVX512BW-ONLY-NEXT:    kmovw (%rdi), %k0
+; AVX512BW-ONLY-NEXT:    vpmovm2b %k0, %zmm0
+; AVX512BW-ONLY-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
+; AVX512BW-ONLY-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3,19,19,19,19,20,20,20,20,20,21,21,21,21,21,22,22,38,38,38,39,39,39,39,39,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-ONLY-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-ONLY-NEXT:    movabsq $1099511627775, %rax # imm = 0xFFFFFFFFFF
+; AVX512BW-ONLY-NEXT:    kmovq %rax, %k1
+; AVX512BW-ONLY-NEXT:    vpcmpgtb %zmm0, %zmm1, %k1 {%k1}
+; AVX512BW-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
+; AVX512BW-ONLY-NEXT:    kshiftrq $32, %k1, %k2
+; AVX512BW-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm1 {%k2} {z}
+; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k1, %k1
+; AVX512BW-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm2 {%k1} {z}
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm2, 64(%rdx)
+; AVX512BW-ONLY-NEXT:    vmovdqa %ymm1, 128(%rdx)
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm0, (%rdx)
+; AVX512BW-ONLY-NEXT:    vzeroupper
+; AVX512BW-ONLY-NEXT:    retq
+;
+; AVX512VBMI-ONLY-LABEL: mask_replication_factor5_vf8:
+; AVX512VBMI-ONLY:       # %bb.0:
+; AVX512VBMI-ONLY-NEXT:    kmovw (%rdi), %k0
+; AVX512VBMI-ONLY-NEXT:    vpmovm2b %k0, %zmm0
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3,3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6,6,6,6,7,7,7,7,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VBMI-ONLY-NEXT:    vpermb %zmm0, %zmm1, %zmm0
+; AVX512VBMI-ONLY-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VBMI-ONLY-NEXT:    movabsq $1099511627775, %rax # imm = 0xFFFFFFFFFF
+; AVX512VBMI-ONLY-NEXT:    kmovq %rax, %k1
+; AVX512VBMI-ONLY-NEXT:    vpcmpgtb %zmm0, %zmm1, %k1 {%k1}
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
+; AVX512VBMI-ONLY-NEXT:    kshiftrq $32, %k1, %k2
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm1 {%k2} {z}
+; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k1, %k1
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm2 {%k1} {z}
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm2, 64(%rdx)
+; AVX512VBMI-ONLY-NEXT:    vmovdqa %ymm1, 128(%rdx)
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm0, (%rdx)
+; AVX512VBMI-ONLY-NEXT:    vzeroupper
+; AVX512VBMI-ONLY-NEXT:    retq
+  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
+  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %tgt.mask = shufflevector <8 x i1> %src.mask, <8 x i1> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
+  %data = call <40 x i32> @llvm.masked.load.v40i32.p0(ptr %in.vec, i32 64, <40 x i1> %tgt.mask, <40 x i32> poison)
+  %data.padded = shufflevector <40 x i32> %data, <40 x i32> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  store <40 x i32> %data, ptr %out.vec, align 64
   ret void
 }
 
-define void @mask_replication_factor5_vf16(ptr %in.vec, ptr %out.vec) nounwind {
+define void @mask_replication_factor5_vf16(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512F-ONLY-LABEL: mask_replication_factor5_vf16:
 ; AVX512F-ONLY:       # %bb.0:
-; AVX512F-ONLY-NEXT:    pushq %rbp
-; AVX512F-ONLY-NEXT:    pushq %r15
-; AVX512F-ONLY-NEXT:    pushq %r14
-; AVX512F-ONLY-NEXT:    pushq %r13
-; AVX512F-ONLY-NEXT:    pushq %r12
-; AVX512F-ONLY-NEXT:    pushq %rbx
-; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k0
-; AVX512F-ONLY-NEXT:    kshiftrw $12, %k0, %k1
-; AVX512F-ONLY-NEXT:    kmovw %k1, %edx
-; AVX512F-ONLY-NEXT:    kshiftrw $6, %k0, %k1
-; AVX512F-ONLY-NEXT:    kmovw %k1, %ecx
-; AVX512F-ONLY-NEXT:    kshiftrw $5, %k0, %k1
-; AVX512F-ONLY-NEXT:    kmovw %k1, %eax
-; AVX512F-ONLY-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512F-ONLY-NEXT:    kshiftrw $4, %k0, %k1
-; AVX512F-ONLY-NEXT:    kmovw %k1, %edi
-; AVX512F-ONLY-NEXT:    kshiftrw $3, %k0, %k1
-; AVX512F-ONLY-NEXT:    kmovw %k1, %r8d
-; AVX512F-ONLY-NEXT:    kshiftrw $2, %k0, %k1
-; AVX512F-ONLY-NEXT:    kmovw %k1, %r9d
-; AVX512F-ONLY-NEXT:    kshiftrw $1, %k0, %k1
-; AVX512F-ONLY-NEXT:    kmovw %k1, %r10d
-; AVX512F-ONLY-NEXT:    kshiftrw $13, %k0, %k1
-; AVX512F-ONLY-NEXT:    kmovw %k1, %r11d
-; AVX512F-ONLY-NEXT:    kshiftrw $14, %k0, %k1
-; AVX512F-ONLY-NEXT:    kmovw %k1, %ebp
-; AVX512F-ONLY-NEXT:    kshiftrw $15, %k0, %k1
-; AVX512F-ONLY-NEXT:    kmovw %k1, %eax
-; AVX512F-ONLY-NEXT:    kshiftrw $7, %k0, %k1
-; AVX512F-ONLY-NEXT:    kmovw %k1, %r15d
-; AVX512F-ONLY-NEXT:    kshiftrw $8, %k0, %k1
-; AVX512F-ONLY-NEXT:    andl $1, %r11d
-; AVX512F-ONLY-NEXT:    movl %edx, %r14d
-; AVX512F-ONLY-NEXT:    andl $1, %r14d
-; AVX512F-ONLY-NEXT:    leal (%r14,%r11,2), %ebx
-; AVX512F-ONLY-NEXT:    leal (%rbx,%r11,4), %ebx
-; AVX512F-ONLY-NEXT:    leal (%rbx,%r11,8), %ebx
-; AVX512F-ONLY-NEXT:    movl %r11d, %r12d
-; AVX512F-ONLY-NEXT:    shll $4, %r12d
-; AVX512F-ONLY-NEXT:    orl %ebx, %r12d
-; AVX512F-ONLY-NEXT:    shll $5, %r11d
-; AVX512F-ONLY-NEXT:    orl %r12d, %r11d
-; AVX512F-ONLY-NEXT:    andl $1, %ebp
-; AVX512F-ONLY-NEXT:    movl %ebp, %ebx
-; AVX512F-ONLY-NEXT:    shll $6, %ebx
-; AVX512F-ONLY-NEXT:    movl %ebp, %r12d
-; AVX512F-ONLY-NEXT:    shll $7, %r12d
-; AVX512F-ONLY-NEXT:    orl %ebx, %r12d
-; AVX512F-ONLY-NEXT:    movl %ebp, %ebx
-; AVX512F-ONLY-NEXT:    shll $8, %ebx
-; AVX512F-ONLY-NEXT:    orl %r12d, %ebx
-; AVX512F-ONLY-NEXT:    movl %ebp, %r12d
-; AVX512F-ONLY-NEXT:    shll $9, %r12d
-; AVX512F-ONLY-NEXT:    orl %ebx, %r12d
-; AVX512F-ONLY-NEXT:    shll $10, %ebp
-; AVX512F-ONLY-NEXT:    orl %r12d, %ebp
-; AVX512F-ONLY-NEXT:    movl %eax, %ebx
-; AVX512F-ONLY-NEXT:    andl $1, %ebx
-; AVX512F-ONLY-NEXT:    movl %ebx, %r12d
-; AVX512F-ONLY-NEXT:    shll $11, %r12d
-; AVX512F-ONLY-NEXT:    orl %ebp, %r12d
-; AVX512F-ONLY-NEXT:    movl %ebx, %ebp
-; AVX512F-ONLY-NEXT:    shll $12, %ebp
-; AVX512F-ONLY-NEXT:    orl %r12d, %ebp
-; AVX512F-ONLY-NEXT:    movl %ebx, %r13d
-; AVX512F-ONLY-NEXT:    shll $13, %r13d
-; AVX512F-ONLY-NEXT:    orl %ebp, %r13d
-; AVX512F-ONLY-NEXT:    kmovw %k1, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $9, %k0, %k1
-; AVX512F-ONLY-NEXT:    shll $14, %ebx
-; AVX512F-ONLY-NEXT:    orl %r13d, %ebx
-; AVX512F-ONLY-NEXT:    kmovw %k1, %r13d
-; AVX512F-ONLY-NEXT:    kshiftrw $10, %k0, %k1
-; AVX512F-ONLY-NEXT:    shll $15, %eax
-; AVX512F-ONLY-NEXT:    orl %ebx, %eax
-; AVX512F-ONLY-NEXT:    orl %r11d, %eax
-; AVX512F-ONLY-NEXT:    movw %ax, 8(%rsi)
-; AVX512F-ONLY-NEXT:    kmovw %k0, %eax
-; AVX512F-ONLY-NEXT:    movzbl %al, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    leaq (%rax,%rax,2), %r11
-; AVX512F-ONLY-NEXT:    leaq (%r11,%rax,4), %r11
-; AVX512F-ONLY-NEXT:    leaq (%r11,%rax,8), %r11
-; AVX512F-ONLY-NEXT:    shlq $4, %rax
-; AVX512F-ONLY-NEXT:    orq %r11, %rax
-; AVX512F-ONLY-NEXT:    movzbl %r10b, %ebp
-; AVX512F-ONLY-NEXT:    andl $1, %ebp
-; AVX512F-ONLY-NEXT:    movq %rbp, %r10
-; AVX512F-ONLY-NEXT:    shlq $5, %r10
-; AVX512F-ONLY-NEXT:    orq %rax, %r10
-; AVX512F-ONLY-NEXT:    movq %rbp, %rax
-; AVX512F-ONLY-NEXT:    shlq $6, %rax
-; AVX512F-ONLY-NEXT:    movq %rbp, %r11
-; AVX512F-ONLY-NEXT:    shlq $7, %r11
-; AVX512F-ONLY-NEXT:    orq %rax, %r11
-; AVX512F-ONLY-NEXT:    movq %rbp, %rax
-; AVX512F-ONLY-NEXT:    shlq $8, %rax
-; AVX512F-ONLY-NEXT:    orq %r11, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k1, %r11d
-; AVX512F-ONLY-NEXT:    kshiftrw $11, %k0, %k0
-; AVX512F-ONLY-NEXT:    shlq $9, %rbp
-; AVX512F-ONLY-NEXT:    orq %rax, %rbp
-; AVX512F-ONLY-NEXT:    movzbl %r9b, %ebx
-; AVX512F-ONLY-NEXT:    andl $1, %ebx
-; AVX512F-ONLY-NEXT:    movq %rbx, %rax
-; AVX512F-ONLY-NEXT:    shlq $10, %rax
-; AVX512F-ONLY-NEXT:    orq %rbp, %rax
-; AVX512F-ONLY-NEXT:    movq %rbx, %r9
-; AVX512F-ONLY-NEXT:    shlq $11, %r9
-; AVX512F-ONLY-NEXT:    orq %rax, %r9
-; AVX512F-ONLY-NEXT:    movq %rbx, %rax
-; AVX512F-ONLY-NEXT:    shlq $12, %rax
-; AVX512F-ONLY-NEXT:    orq %r9, %rax
-; AVX512F-ONLY-NEXT:    movq %rbx, %rbp
-; AVX512F-ONLY-NEXT:    shlq $13, %rbp
-; AVX512F-ONLY-NEXT:    orq %rax, %rbp
-; AVX512F-ONLY-NEXT:    kmovw %k0, %r9d
-; AVX512F-ONLY-NEXT:    movzbl %dl, %edx
-; AVX512F-ONLY-NEXT:    shlq $14, %rbx
-; AVX512F-ONLY-NEXT:    orq %rbp, %rbx
-; AVX512F-ONLY-NEXT:    movzbl %r8b, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %r8
-; AVX512F-ONLY-NEXT:    shlq $15, %r8
-; AVX512F-ONLY-NEXT:    orq %rbx, %r8
-; AVX512F-ONLY-NEXT:    movq %rax, %rbx
-; AVX512F-ONLY-NEXT:    shlq $16, %rbx
-; AVX512F-ONLY-NEXT:    orq %r8, %rbx
-; AVX512F-ONLY-NEXT:    movq %rax, %r8
-; AVX512F-ONLY-NEXT:    shlq $17, %r8
-; AVX512F-ONLY-NEXT:    orq %rbx, %r8
-; AVX512F-ONLY-NEXT:    movq %rax, %rbx
-; AVX512F-ONLY-NEXT:    shlq $18, %rbx
-; AVX512F-ONLY-NEXT:    orq %r8, %rbx
-; AVX512F-ONLY-NEXT:    shlq $19, %rax
-; AVX512F-ONLY-NEXT:    orq %rbx, %rax
-; AVX512F-ONLY-NEXT:    movzbl %dil, %edi
-; AVX512F-ONLY-NEXT:    andl $1, %edi
-; AVX512F-ONLY-NEXT:    movq %rdi, %r8
-; AVX512F-ONLY-NEXT:    shlq $20, %r8
-; AVX512F-ONLY-NEXT:    orq %rax, %r8
-; AVX512F-ONLY-NEXT:    movq %rdi, %rax
-; AVX512F-ONLY-NEXT:    shlq $21, %rax
-; AVX512F-ONLY-NEXT:    orq %r8, %rax
-; AVX512F-ONLY-NEXT:    movq %rdi, %r8
-; AVX512F-ONLY-NEXT:    shlq $22, %r8
-; AVX512F-ONLY-NEXT:    orq %rax, %r8
-; AVX512F-ONLY-NEXT:    movq %rdi, %rax
-; AVX512F-ONLY-NEXT:    shlq $23, %rax
-; AVX512F-ONLY-NEXT:    orq %r8, %rax
-; AVX512F-ONLY-NEXT:    shlq $24, %rdi
-; AVX512F-ONLY-NEXT:    orq %rax, %rdi
-; AVX512F-ONLY-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %r8
-; AVX512F-ONLY-NEXT:    shlq $25, %r8
-; AVX512F-ONLY-NEXT:    orq %rdi, %r8
-; AVX512F-ONLY-NEXT:    movq %rax, %rdi
-; AVX512F-ONLY-NEXT:    shlq $26, %rdi
-; AVX512F-ONLY-NEXT:    orq %r8, %rdi
-; AVX512F-ONLY-NEXT:    movq %rax, %r8
-; AVX512F-ONLY-NEXT:    shlq $27, %r8
-; AVX512F-ONLY-NEXT:    orq %rdi, %r8
-; AVX512F-ONLY-NEXT:    movq %rax, %rdi
-; AVX512F-ONLY-NEXT:    shlq $28, %rdi
-; AVX512F-ONLY-NEXT:    orq %r8, %rdi
-; AVX512F-ONLY-NEXT:    shlq $29, %rax
-; AVX512F-ONLY-NEXT:    orq %rdi, %rax
-; AVX512F-ONLY-NEXT:    movzbl %cl, %edi
-; AVX512F-ONLY-NEXT:    andl $1, %edi
-; AVX512F-ONLY-NEXT:    movq %rdi, %r8
-; AVX512F-ONLY-NEXT:    shlq $30, %r8
-; AVX512F-ONLY-NEXT:    orq %rax, %r8
-; AVX512F-ONLY-NEXT:    shlq $31, %rdi
-; AVX512F-ONLY-NEXT:    orq %r8, %rdi
-; AVX512F-ONLY-NEXT:    andl $1, %ecx
-; AVX512F-ONLY-NEXT:    movq %rcx, %rax
-; AVX512F-ONLY-NEXT:    shlq $32, %rax
-; AVX512F-ONLY-NEXT:    orq %rdi, %rax
-; AVX512F-ONLY-NEXT:    movq %rcx, %rdi
-; AVX512F-ONLY-NEXT:    shlq $33, %rdi
-; AVX512F-ONLY-NEXT:    orq %rax, %rdi
-; AVX512F-ONLY-NEXT:    shlq $34, %rcx
-; AVX512F-ONLY-NEXT:    orq %rdi, %rcx
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $35, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %rcx
-; AVX512F-ONLY-NEXT:    shlq $36, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $37, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %rcx
-; AVX512F-ONLY-NEXT:    shlq $38, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    shlq $39, %r15
-; AVX512F-ONLY-NEXT:    orq %rcx, %r15
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $40, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %rcx
-; AVX512F-ONLY-NEXT:    shlq $41, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $42, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %rcx
-; AVX512F-ONLY-NEXT:    shlq $43, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    shlq $44, %r12
-; AVX512F-ONLY-NEXT:    orq %rcx, %r12
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $45, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %rcx
-; AVX512F-ONLY-NEXT:    shlq $46, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $47, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %rcx
-; AVX512F-ONLY-NEXT:    shlq $48, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    shlq $49, %r13
-; AVX512F-ONLY-NEXT:    orq %rcx, %r13
-; AVX512F-ONLY-NEXT:    andl $1, %r11d
-; AVX512F-ONLY-NEXT:    movq %r11, %rax
-; AVX512F-ONLY-NEXT:    shlq $50, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r11, %rcx
-; AVX512F-ONLY-NEXT:    shlq $51, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    movq %r11, %rax
-; AVX512F-ONLY-NEXT:    shlq $52, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    movq %r11, %rcx
-; AVX512F-ONLY-NEXT:    shlq $53, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    shlq $54, %r11
-; AVX512F-ONLY-NEXT:    orq %rcx, %r11
-; AVX512F-ONLY-NEXT:    andl $1, %r9d
-; AVX512F-ONLY-NEXT:    movq %r9, %rax
-; AVX512F-ONLY-NEXT:    shlq $55, %rax
-; AVX512F-ONLY-NEXT:    orq %r11, %rax
-; AVX512F-ONLY-NEXT:    movq %r9, %rcx
-; AVX512F-ONLY-NEXT:    shlq $56, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    movq %r9, %rax
-; AVX512F-ONLY-NEXT:    shlq $57, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    movq %r9, %rcx
-; AVX512F-ONLY-NEXT:    shlq $58, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    shlq $59, %r9
-; AVX512F-ONLY-NEXT:    orq %rcx, %r9
-; AVX512F-ONLY-NEXT:    movq %r14, %rax
-; AVX512F-ONLY-NEXT:    shlq $60, %rax
-; AVX512F-ONLY-NEXT:    orq %r9, %rax
-; AVX512F-ONLY-NEXT:    movq %r14, %rcx
-; AVX512F-ONLY-NEXT:    shlq $61, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    shlq $62, %r14
-; AVX512F-ONLY-NEXT:    orq %rcx, %r14
-; AVX512F-ONLY-NEXT:    shlq $63, %rdx
-; AVX512F-ONLY-NEXT:    orq %r14, %rdx
-; AVX512F-ONLY-NEXT:    orq %r10, %rdx
-; AVX512F-ONLY-NEXT:    movq %rdx, (%rsi)
-; AVX512F-ONLY-NEXT:    popq %rbx
-; AVX512F-ONLY-NEXT:    popq %r12
-; AVX512F-ONLY-NEXT:    popq %r13
-; AVX512F-ONLY-NEXT:    popq %r14
-; AVX512F-ONLY-NEXT:    popq %r15
-; AVX512F-ONLY-NEXT:    popq %rbp
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; AVX512F-ONLY-NEXT:    movw $1, %ax
+; AVX512F-ONLY-NEXT:    kmovw %eax, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k2
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k3
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k4
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k5
+; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 256(%rsi), %zmm1 {%k5} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k4} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k3} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm4 {%k2} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm4, 64(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm1, 256(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, (%rdx)
+; AVX512F-ONLY-NEXT:    vzeroupper
 ; AVX512F-ONLY-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: mask_replication_factor5_vf16:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    pushq %rbp
-; AVX512DQ-NEXT:    pushq %r15
-; AVX512DQ-NEXT:    pushq %r14
-; AVX512DQ-NEXT:    pushq %r13
-; AVX512DQ-NEXT:    pushq %r12
-; AVX512DQ-NEXT:    pushq %rbx
 ; AVX512DQ-NEXT:    kmovw (%rdi), %k0
-; AVX512DQ-NEXT:    kshiftrw $12, %k0, %k1
-; AVX512DQ-NEXT:    kmovw %k1, %edx
-; AVX512DQ-NEXT:    kshiftrw $6, %k0, %k1
-; AVX512DQ-NEXT:    kmovw %k1, %ecx
-; AVX512DQ-NEXT:    kshiftrw $5, %k0, %k1
-; AVX512DQ-NEXT:    kmovw %k1, %eax
-; AVX512DQ-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512DQ-NEXT:    kshiftrw $4, %k0, %k1
-; AVX512DQ-NEXT:    kmovw %k1, %edi
-; AVX512DQ-NEXT:    kshiftrw $3, %k0, %k1
-; AVX512DQ-NEXT:    kmovw %k1, %r8d
-; AVX512DQ-NEXT:    kshiftrw $2, %k0, %k1
-; AVX512DQ-NEXT:    kmovw %k1, %r9d
-; AVX512DQ-NEXT:    kshiftrw $1, %k0, %k1
-; AVX512DQ-NEXT:    kmovw %k1, %r10d
-; AVX512DQ-NEXT:    kshiftrw $13, %k0, %k1
-; AVX512DQ-NEXT:    kmovw %k1, %r11d
-; AVX512DQ-NEXT:    kshiftrw $14, %k0, %k1
-; AVX512DQ-NEXT:    kmovw %k1, %ebp
-; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k1
-; AVX512DQ-NEXT:    kmovw %k1, %eax
-; AVX512DQ-NEXT:    kshiftrw $7, %k0, %k1
-; AVX512DQ-NEXT:    kmovw %k1, %r15d
-; AVX512DQ-NEXT:    kshiftrw $8, %k0, %k1
-; AVX512DQ-NEXT:    andl $1, %r11d
-; AVX512DQ-NEXT:    movl %edx, %r14d
-; AVX512DQ-NEXT:    andl $1, %r14d
-; AVX512DQ-NEXT:    leal (%r14,%r11,2), %ebx
-; AVX512DQ-NEXT:    leal (%rbx,%r11,4), %ebx
-; AVX512DQ-NEXT:    leal (%rbx,%r11,8), %ebx
-; AVX512DQ-NEXT:    movl %r11d, %r12d
-; AVX512DQ-NEXT:    shll $4, %r12d
-; AVX512DQ-NEXT:    orl %ebx, %r12d
-; AVX512DQ-NEXT:    shll $5, %r11d
-; AVX512DQ-NEXT:    orl %r12d, %r11d
-; AVX512DQ-NEXT:    andl $1, %ebp
-; AVX512DQ-NEXT:    movl %ebp, %ebx
-; AVX512DQ-NEXT:    shll $6, %ebx
-; AVX512DQ-NEXT:    movl %ebp, %r12d
-; AVX512DQ-NEXT:    shll $7, %r12d
-; AVX512DQ-NEXT:    orl %ebx, %r12d
-; AVX512DQ-NEXT:    movl %ebp, %ebx
-; AVX512DQ-NEXT:    shll $8, %ebx
-; AVX512DQ-NEXT:    orl %r12d, %ebx
-; AVX512DQ-NEXT:    movl %ebp, %r12d
-; AVX512DQ-NEXT:    shll $9, %r12d
-; AVX512DQ-NEXT:    orl %ebx, %r12d
-; AVX512DQ-NEXT:    shll $10, %ebp
-; AVX512DQ-NEXT:    orl %r12d, %ebp
-; AVX512DQ-NEXT:    movl %eax, %ebx
-; AVX512DQ-NEXT:    andl $1, %ebx
-; AVX512DQ-NEXT:    movl %ebx, %r12d
-; AVX512DQ-NEXT:    shll $11, %r12d
-; AVX512DQ-NEXT:    orl %ebp, %r12d
-; AVX512DQ-NEXT:    movl %ebx, %ebp
-; AVX512DQ-NEXT:    shll $12, %ebp
-; AVX512DQ-NEXT:    orl %r12d, %ebp
-; AVX512DQ-NEXT:    movl %ebx, %r13d
-; AVX512DQ-NEXT:    shll $13, %r13d
-; AVX512DQ-NEXT:    orl %ebp, %r13d
-; AVX512DQ-NEXT:    kmovw %k1, %r12d
-; AVX512DQ-NEXT:    kshiftrw $9, %k0, %k1
-; AVX512DQ-NEXT:    shll $14, %ebx
-; AVX512DQ-NEXT:    orl %r13d, %ebx
-; AVX512DQ-NEXT:    kmovw %k1, %r13d
-; AVX512DQ-NEXT:    kshiftrw $10, %k0, %k1
-; AVX512DQ-NEXT:    shll $15, %eax
-; AVX512DQ-NEXT:    orl %ebx, %eax
-; AVX512DQ-NEXT:    orl %r11d, %eax
-; AVX512DQ-NEXT:    movw %ax, 8(%rsi)
-; AVX512DQ-NEXT:    kmovw %k0, %eax
-; AVX512DQ-NEXT:    movzbl %al, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    leaq (%rax,%rax,2), %r11
-; AVX512DQ-NEXT:    leaq (%r11,%rax,4), %r11
-; AVX512DQ-NEXT:    leaq (%r11,%rax,8), %r11
-; AVX512DQ-NEXT:    shlq $4, %rax
-; AVX512DQ-NEXT:    orq %r11, %rax
-; AVX512DQ-NEXT:    movzbl %r10b, %ebp
-; AVX512DQ-NEXT:    andl $1, %ebp
-; AVX512DQ-NEXT:    movq %rbp, %r10
-; AVX512DQ-NEXT:    shlq $5, %r10
-; AVX512DQ-NEXT:    orq %rax, %r10
-; AVX512DQ-NEXT:    movq %rbp, %rax
-; AVX512DQ-NEXT:    shlq $6, %rax
-; AVX512DQ-NEXT:    movq %rbp, %r11
-; AVX512DQ-NEXT:    shlq $7, %r11
-; AVX512DQ-NEXT:    orq %rax, %r11
-; AVX512DQ-NEXT:    movq %rbp, %rax
-; AVX512DQ-NEXT:    shlq $8, %rax
-; AVX512DQ-NEXT:    orq %r11, %rax
-; AVX512DQ-NEXT:    kmovw %k1, %r11d
-; AVX512DQ-NEXT:    kshiftrw $11, %k0, %k0
-; AVX512DQ-NEXT:    shlq $9, %rbp
-; AVX512DQ-NEXT:    orq %rax, %rbp
-; AVX512DQ-NEXT:    movzbl %r9b, %ebx
-; AVX512DQ-NEXT:    andl $1, %ebx
-; AVX512DQ-NEXT:    movq %rbx, %rax
-; AVX512DQ-NEXT:    shlq $10, %rax
-; AVX512DQ-NEXT:    orq %rbp, %rax
-; AVX512DQ-NEXT:    movq %rbx, %r9
-; AVX512DQ-NEXT:    shlq $11, %r9
-; AVX512DQ-NEXT:    orq %rax, %r9
-; AVX512DQ-NEXT:    movq %rbx, %rax
-; AVX512DQ-NEXT:    shlq $12, %rax
-; AVX512DQ-NEXT:    orq %r9, %rax
-; AVX512DQ-NEXT:    movq %rbx, %rbp
-; AVX512DQ-NEXT:    shlq $13, %rbp
-; AVX512DQ-NEXT:    orq %rax, %rbp
-; AVX512DQ-NEXT:    kmovw %k0, %r9d
-; AVX512DQ-NEXT:    movzbl %dl, %edx
-; AVX512DQ-NEXT:    shlq $14, %rbx
-; AVX512DQ-NEXT:    orq %rbp, %rbx
-; AVX512DQ-NEXT:    movzbl %r8b, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %r8
-; AVX512DQ-NEXT:    shlq $15, %r8
-; AVX512DQ-NEXT:    orq %rbx, %r8
-; AVX512DQ-NEXT:    movq %rax, %rbx
-; AVX512DQ-NEXT:    shlq $16, %rbx
-; AVX512DQ-NEXT:    orq %r8, %rbx
-; AVX512DQ-NEXT:    movq %rax, %r8
-; AVX512DQ-NEXT:    shlq $17, %r8
-; AVX512DQ-NEXT:    orq %rbx, %r8
-; AVX512DQ-NEXT:    movq %rax, %rbx
-; AVX512DQ-NEXT:    shlq $18, %rbx
-; AVX512DQ-NEXT:    orq %r8, %rbx
-; AVX512DQ-NEXT:    shlq $19, %rax
-; AVX512DQ-NEXT:    orq %rbx, %rax
-; AVX512DQ-NEXT:    movzbl %dil, %edi
-; AVX512DQ-NEXT:    andl $1, %edi
-; AVX512DQ-NEXT:    movq %rdi, %r8
-; AVX512DQ-NEXT:    shlq $20, %r8
-; AVX512DQ-NEXT:    orq %rax, %r8
-; AVX512DQ-NEXT:    movq %rdi, %rax
-; AVX512DQ-NEXT:    shlq $21, %rax
-; AVX512DQ-NEXT:    orq %r8, %rax
-; AVX512DQ-NEXT:    movq %rdi, %r8
-; AVX512DQ-NEXT:    shlq $22, %r8
-; AVX512DQ-NEXT:    orq %rax, %r8
-; AVX512DQ-NEXT:    movq %rdi, %rax
-; AVX512DQ-NEXT:    shlq $23, %rax
-; AVX512DQ-NEXT:    orq %r8, %rax
-; AVX512DQ-NEXT:    shlq $24, %rdi
-; AVX512DQ-NEXT:    orq %rax, %rdi
-; AVX512DQ-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %r8
-; AVX512DQ-NEXT:    shlq $25, %r8
-; AVX512DQ-NEXT:    orq %rdi, %r8
-; AVX512DQ-NEXT:    movq %rax, %rdi
-; AVX512DQ-NEXT:    shlq $26, %rdi
-; AVX512DQ-NEXT:    orq %r8, %rdi
-; AVX512DQ-NEXT:    movq %rax, %r8
-; AVX512DQ-NEXT:    shlq $27, %r8
-; AVX512DQ-NEXT:    orq %rdi, %r8
-; AVX512DQ-NEXT:    movq %rax, %rdi
-; AVX512DQ-NEXT:    shlq $28, %rdi
-; AVX512DQ-NEXT:    orq %r8, %rdi
-; AVX512DQ-NEXT:    shlq $29, %rax
-; AVX512DQ-NEXT:    orq %rdi, %rax
-; AVX512DQ-NEXT:    movzbl %cl, %edi
-; AVX512DQ-NEXT:    andl $1, %edi
-; AVX512DQ-NEXT:    movq %rdi, %r8
-; AVX512DQ-NEXT:    shlq $30, %r8
-; AVX512DQ-NEXT:    orq %rax, %r8
-; AVX512DQ-NEXT:    shlq $31, %rdi
-; AVX512DQ-NEXT:    orq %r8, %rdi
-; AVX512DQ-NEXT:    andl $1, %ecx
-; AVX512DQ-NEXT:    movq %rcx, %rax
-; AVX512DQ-NEXT:    shlq $32, %rax
-; AVX512DQ-NEXT:    orq %rdi, %rax
-; AVX512DQ-NEXT:    movq %rcx, %rdi
-; AVX512DQ-NEXT:    shlq $33, %rdi
-; AVX512DQ-NEXT:    orq %rax, %rdi
-; AVX512DQ-NEXT:    shlq $34, %rcx
-; AVX512DQ-NEXT:    orq %rdi, %rcx
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $35, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    movq %r15, %rcx
-; AVX512DQ-NEXT:    shlq $36, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $37, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    movq %r15, %rcx
-; AVX512DQ-NEXT:    shlq $38, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    shlq $39, %r15
-; AVX512DQ-NEXT:    orq %rcx, %r15
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $40, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movq %r12, %rcx
-; AVX512DQ-NEXT:    shlq $41, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $42, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    movq %r12, %rcx
-; AVX512DQ-NEXT:    shlq $43, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    shlq $44, %r12
-; AVX512DQ-NEXT:    orq %rcx, %r12
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $45, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %rcx
-; AVX512DQ-NEXT:    shlq $46, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $47, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    movq %r13, %rcx
-; AVX512DQ-NEXT:    shlq $48, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    shlq $49, %r13
-; AVX512DQ-NEXT:    orq %rcx, %r13
-; AVX512DQ-NEXT:    andl $1, %r11d
-; AVX512DQ-NEXT:    movq %r11, %rax
-; AVX512DQ-NEXT:    shlq $50, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r11, %rcx
-; AVX512DQ-NEXT:    shlq $51, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    movq %r11, %rax
-; AVX512DQ-NEXT:    shlq $52, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    movq %r11, %rcx
-; AVX512DQ-NEXT:    shlq $53, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    shlq $54, %r11
-; AVX512DQ-NEXT:    orq %rcx, %r11
-; AVX512DQ-NEXT:    andl $1, %r9d
-; AVX512DQ-NEXT:    movq %r9, %rax
-; AVX512DQ-NEXT:    shlq $55, %rax
-; AVX512DQ-NEXT:    orq %r11, %rax
-; AVX512DQ-NEXT:    movq %r9, %rcx
-; AVX512DQ-NEXT:    shlq $56, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    movq %r9, %rax
-; AVX512DQ-NEXT:    shlq $57, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    movq %r9, %rcx
-; AVX512DQ-NEXT:    shlq $58, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    shlq $59, %r9
-; AVX512DQ-NEXT:    orq %rcx, %r9
-; AVX512DQ-NEXT:    movq %r14, %rax
-; AVX512DQ-NEXT:    shlq $60, %rax
-; AVX512DQ-NEXT:    orq %r9, %rax
-; AVX512DQ-NEXT:    movq %r14, %rcx
-; AVX512DQ-NEXT:    shlq $61, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    shlq $62, %r14
-; AVX512DQ-NEXT:    orq %rcx, %r14
-; AVX512DQ-NEXT:    shlq $63, %rdx
-; AVX512DQ-NEXT:    orq %r14, %rdx
-; AVX512DQ-NEXT:    orq %r10, %rdx
-; AVX512DQ-NEXT:    movq %rdx, (%rsi)
-; AVX512DQ-NEXT:    popq %rbx
-; AVX512DQ-NEXT:    popq %r12
-; AVX512DQ-NEXT:    popq %r13
-; AVX512DQ-NEXT:    popq %r14
-; AVX512DQ-NEXT:    popq %r15
-; AVX512DQ-NEXT:    popq %rbp
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm1
+; AVX512DQ-NEXT:    movw $1, %ax
+; AVX512DQ-NEXT:    kmovw %eax, %k1
+; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k1
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k2
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k3
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k4
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k5
+; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
+; AVX512DQ-NEXT:    vmovdqa32 256(%rsi), %zmm1 {%k5} {z}
+; AVX512DQ-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k4} {z}
+; AVX512DQ-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k3} {z}
+; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm4 {%k2} {z}
+; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 64(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 256(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm0, (%rdx)
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512BW-LABEL: mask_replication_factor5_vf16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    pushq %rbp
-; AVX512BW-NEXT:    pushq %r15
-; AVX512BW-NEXT:    pushq %r14
-; AVX512BW-NEXT:    pushq %r13
-; AVX512BW-NEXT:    pushq %r12
-; AVX512BW-NEXT:    pushq %rbx
-; AVX512BW-NEXT:    kmovw (%rdi), %k0
-; AVX512BW-NEXT:    kshiftrw $12, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %edx
-; AVX512BW-NEXT:    kshiftrw $6, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %ecx
-; AVX512BW-NEXT:    kshiftrw $5, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512BW-NEXT:    kshiftrw $4, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %edi
-; AVX512BW-NEXT:    kshiftrw $3, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r8d
-; AVX512BW-NEXT:    kshiftrw $2, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r9d
-; AVX512BW-NEXT:    kshiftrw $1, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r10d
-; AVX512BW-NEXT:    kshiftrw $13, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r11d
-; AVX512BW-NEXT:    kshiftrw $14, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %ebp
-; AVX512BW-NEXT:    kshiftrw $15, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    kshiftrw $7, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r15d
-; AVX512BW-NEXT:    kshiftrw $8, %k0, %k1
-; AVX512BW-NEXT:    andl $1, %r11d
-; AVX512BW-NEXT:    movl %edx, %r14d
-; AVX512BW-NEXT:    andl $1, %r14d
-; AVX512BW-NEXT:    leal (%r14,%r11,2), %ebx
-; AVX512BW-NEXT:    leal (%rbx,%r11,4), %ebx
-; AVX512BW-NEXT:    leal (%rbx,%r11,8), %ebx
-; AVX512BW-NEXT:    movl %r11d, %r12d
-; AVX512BW-NEXT:    shll $4, %r12d
-; AVX512BW-NEXT:    orl %ebx, %r12d
-; AVX512BW-NEXT:    shll $5, %r11d
-; AVX512BW-NEXT:    orl %r12d, %r11d
-; AVX512BW-NEXT:    andl $1, %ebp
-; AVX512BW-NEXT:    movl %ebp, %ebx
-; AVX512BW-NEXT:    shll $6, %ebx
-; AVX512BW-NEXT:    movl %ebp, %r12d
-; AVX512BW-NEXT:    shll $7, %r12d
-; AVX512BW-NEXT:    orl %ebx, %r12d
-; AVX512BW-NEXT:    movl %ebp, %ebx
-; AVX512BW-NEXT:    shll $8, %ebx
-; AVX512BW-NEXT:    orl %r12d, %ebx
-; AVX512BW-NEXT:    movl %ebp, %r12d
-; AVX512BW-NEXT:    shll $9, %r12d
-; AVX512BW-NEXT:    orl %ebx, %r12d
-; AVX512BW-NEXT:    shll $10, %ebp
-; AVX512BW-NEXT:    orl %r12d, %ebp
-; AVX512BW-NEXT:    movl %eax, %ebx
-; AVX512BW-NEXT:    andl $1, %ebx
-; AVX512BW-NEXT:    movl %ebx, %r12d
-; AVX512BW-NEXT:    shll $11, %r12d
-; AVX512BW-NEXT:    orl %ebp, %r12d
-; AVX512BW-NEXT:    movl %ebx, %ebp
-; AVX512BW-NEXT:    shll $12, %ebp
-; AVX512BW-NEXT:    orl %r12d, %ebp
-; AVX512BW-NEXT:    movl %ebx, %r13d
-; AVX512BW-NEXT:    shll $13, %r13d
-; AVX512BW-NEXT:    orl %ebp, %r13d
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrw $9, %k0, %k1
-; AVX512BW-NEXT:    shll $14, %ebx
-; AVX512BW-NEXT:    orl %r13d, %ebx
-; AVX512BW-NEXT:    kmovd %k1, %r13d
-; AVX512BW-NEXT:    kshiftrw $10, %k0, %k1
-; AVX512BW-NEXT:    shll $15, %eax
-; AVX512BW-NEXT:    orl %ebx, %eax
-; AVX512BW-NEXT:    orl %r11d, %eax
-; AVX512BW-NEXT:    movw %ax, 8(%rsi)
-; AVX512BW-NEXT:    kmovd %k0, %eax
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    leaq (%rax,%rax,2), %r11
-; AVX512BW-NEXT:    leaq (%r11,%rax,4), %r11
-; AVX512BW-NEXT:    leaq (%r11,%rax,8), %r11
-; AVX512BW-NEXT:    shlq $4, %rax
-; AVX512BW-NEXT:    orq %r11, %rax
-; AVX512BW-NEXT:    movzbl %r10b, %ebp
-; AVX512BW-NEXT:    andl $1, %ebp
-; AVX512BW-NEXT:    movq %rbp, %r10
-; AVX512BW-NEXT:    shlq $5, %r10
-; AVX512BW-NEXT:    orq %rax, %r10
-; AVX512BW-NEXT:    movq %rbp, %rax
-; AVX512BW-NEXT:    shlq $6, %rax
-; AVX512BW-NEXT:    movq %rbp, %r11
-; AVX512BW-NEXT:    shlq $7, %r11
-; AVX512BW-NEXT:    orq %rax, %r11
-; AVX512BW-NEXT:    movq %rbp, %rax
-; AVX512BW-NEXT:    shlq $8, %rax
-; AVX512BW-NEXT:    orq %r11, %rax
-; AVX512BW-NEXT:    kmovd %k1, %r11d
-; AVX512BW-NEXT:    kshiftrw $11, %k0, %k0
-; AVX512BW-NEXT:    shlq $9, %rbp
-; AVX512BW-NEXT:    orq %rax, %rbp
-; AVX512BW-NEXT:    movzbl %r9b, %ebx
-; AVX512BW-NEXT:    andl $1, %ebx
-; AVX512BW-NEXT:    movq %rbx, %rax
-; AVX512BW-NEXT:    shlq $10, %rax
-; AVX512BW-NEXT:    orq %rbp, %rax
-; AVX512BW-NEXT:    movq %rbx, %r9
-; AVX512BW-NEXT:    shlq $11, %r9
-; AVX512BW-NEXT:    orq %rax, %r9
-; AVX512BW-NEXT:    movq %rbx, %rax
-; AVX512BW-NEXT:    shlq $12, %rax
-; AVX512BW-NEXT:    orq %r9, %rax
-; AVX512BW-NEXT:    movq %rbx, %rbp
-; AVX512BW-NEXT:    shlq $13, %rbp
-; AVX512BW-NEXT:    orq %rax, %rbp
-; AVX512BW-NEXT:    kmovd %k0, %r9d
-; AVX512BW-NEXT:    movzbl %dl, %edx
-; AVX512BW-NEXT:    shlq $14, %rbx
-; AVX512BW-NEXT:    orq %rbp, %rbx
-; AVX512BW-NEXT:    movzbl %r8b, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %r8
-; AVX512BW-NEXT:    shlq $15, %r8
-; AVX512BW-NEXT:    orq %rbx, %r8
-; AVX512BW-NEXT:    movq %rax, %rbx
-; AVX512BW-NEXT:    shlq $16, %rbx
-; AVX512BW-NEXT:    orq %r8, %rbx
-; AVX512BW-NEXT:    movq %rax, %r8
-; AVX512BW-NEXT:    shlq $17, %r8
-; AVX512BW-NEXT:    orq %rbx, %r8
-; AVX512BW-NEXT:    movq %rax, %rbx
-; AVX512BW-NEXT:    shlq $18, %rbx
-; AVX512BW-NEXT:    orq %r8, %rbx
-; AVX512BW-NEXT:    shlq $19, %rax
-; AVX512BW-NEXT:    orq %rbx, %rax
-; AVX512BW-NEXT:    movzbl %dil, %edi
-; AVX512BW-NEXT:    andl $1, %edi
-; AVX512BW-NEXT:    movq %rdi, %r8
-; AVX512BW-NEXT:    shlq $20, %r8
-; AVX512BW-NEXT:    orq %rax, %r8
-; AVX512BW-NEXT:    movq %rdi, %rax
-; AVX512BW-NEXT:    shlq $21, %rax
-; AVX512BW-NEXT:    orq %r8, %rax
-; AVX512BW-NEXT:    movq %rdi, %r8
-; AVX512BW-NEXT:    shlq $22, %r8
-; AVX512BW-NEXT:    orq %rax, %r8
-; AVX512BW-NEXT:    movq %rdi, %rax
-; AVX512BW-NEXT:    shlq $23, %rax
-; AVX512BW-NEXT:    orq %r8, %rax
-; AVX512BW-NEXT:    shlq $24, %rdi
-; AVX512BW-NEXT:    orq %rax, %rdi
-; AVX512BW-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %r8
-; AVX512BW-NEXT:    shlq $25, %r8
-; AVX512BW-NEXT:    orq %rdi, %r8
-; AVX512BW-NEXT:    movq %rax, %rdi
-; AVX512BW-NEXT:    shlq $26, %rdi
-; AVX512BW-NEXT:    orq %r8, %rdi
-; AVX512BW-NEXT:    movq %rax, %r8
-; AVX512BW-NEXT:    shlq $27, %r8
-; AVX512BW-NEXT:    orq %rdi, %r8
-; AVX512BW-NEXT:    movq %rax, %rdi
-; AVX512BW-NEXT:    shlq $28, %rdi
-; AVX512BW-NEXT:    orq %r8, %rdi
-; AVX512BW-NEXT:    shlq $29, %rax
-; AVX512BW-NEXT:    orq %rdi, %rax
-; AVX512BW-NEXT:    movzbl %cl, %edi
-; AVX512BW-NEXT:    andl $1, %edi
-; AVX512BW-NEXT:    movq %rdi, %r8
-; AVX512BW-NEXT:    shlq $30, %r8
-; AVX512BW-NEXT:    orq %rax, %r8
-; AVX512BW-NEXT:    shlq $31, %rdi
-; AVX512BW-NEXT:    orq %r8, %rdi
-; AVX512BW-NEXT:    andl $1, %ecx
-; AVX512BW-NEXT:    movq %rcx, %rax
-; AVX512BW-NEXT:    shlq $32, %rax
-; AVX512BW-NEXT:    orq %rdi, %rax
-; AVX512BW-NEXT:    movq %rcx, %rdi
-; AVX512BW-NEXT:    shlq $33, %rdi
-; AVX512BW-NEXT:    orq %rax, %rdi
-; AVX512BW-NEXT:    shlq $34, %rcx
-; AVX512BW-NEXT:    orq %rdi, %rcx
-; AVX512BW-NEXT:    andl $1, %r15d
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $35, %rax
-; AVX512BW-NEXT:    orq %rcx, %rax
-; AVX512BW-NEXT:    movq %r15, %rcx
-; AVX512BW-NEXT:    shlq $36, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $37, %rax
-; AVX512BW-NEXT:    orq %rcx, %rax
-; AVX512BW-NEXT:    movq %r15, %rcx
-; AVX512BW-NEXT:    shlq $38, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    shlq $39, %r15
-; AVX512BW-NEXT:    orq %rcx, %r15
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $40, %rax
-; AVX512BW-NEXT:    orq %r15, %rax
-; AVX512BW-NEXT:    movq %r12, %rcx
-; AVX512BW-NEXT:    shlq $41, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $42, %rax
-; AVX512BW-NEXT:    orq %rcx, %rax
-; AVX512BW-NEXT:    movq %r12, %rcx
-; AVX512BW-NEXT:    shlq $43, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    shlq $44, %r12
-; AVX512BW-NEXT:    orq %rcx, %r12
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $45, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %rcx
-; AVX512BW-NEXT:    shlq $46, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $47, %rax
-; AVX512BW-NEXT:    orq %rcx, %rax
-; AVX512BW-NEXT:    movq %r13, %rcx
-; AVX512BW-NEXT:    shlq $48, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    shlq $49, %r13
-; AVX512BW-NEXT:    orq %rcx, %r13
-; AVX512BW-NEXT:    andl $1, %r11d
-; AVX512BW-NEXT:    movq %r11, %rax
-; AVX512BW-NEXT:    shlq $50, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r11, %rcx
-; AVX512BW-NEXT:    shlq $51, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    movq %r11, %rax
-; AVX512BW-NEXT:    shlq $52, %rax
-; AVX512BW-NEXT:    orq %rcx, %rax
-; AVX512BW-NEXT:    movq %r11, %rcx
-; AVX512BW-NEXT:    shlq $53, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    shlq $54, %r11
-; AVX512BW-NEXT:    orq %rcx, %r11
-; AVX512BW-NEXT:    andl $1, %r9d
-; AVX512BW-NEXT:    movq %r9, %rax
-; AVX512BW-NEXT:    shlq $55, %rax
-; AVX512BW-NEXT:    orq %r11, %rax
-; AVX512BW-NEXT:    movq %r9, %rcx
-; AVX512BW-NEXT:    shlq $56, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    movq %r9, %rax
-; AVX512BW-NEXT:    shlq $57, %rax
-; AVX512BW-NEXT:    orq %rcx, %rax
-; AVX512BW-NEXT:    movq %r9, %rcx
-; AVX512BW-NEXT:    shlq $58, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    shlq $59, %r9
-; AVX512BW-NEXT:    orq %rcx, %r9
-; AVX512BW-NEXT:    movq %r14, %rax
-; AVX512BW-NEXT:    shlq $60, %rax
-; AVX512BW-NEXT:    orq %r9, %rax
-; AVX512BW-NEXT:    movq %r14, %rcx
-; AVX512BW-NEXT:    shlq $61, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    shlq $62, %r14
-; AVX512BW-NEXT:    orq %rcx, %r14
-; AVX512BW-NEXT:    shlq $63, %rdx
-; AVX512BW-NEXT:    orq %r14, %rdx
-; AVX512BW-NEXT:    orq %r10, %rdx
-; AVX512BW-NEXT:    movq %rdx, (%rsi)
-; AVX512BW-NEXT:    popq %rbx
-; AVX512BW-NEXT:    popq %r12
-; AVX512BW-NEXT:    popq %r13
-; AVX512BW-NEXT:    popq %r14
-; AVX512BW-NEXT:    popq %r15
-; AVX512BW-NEXT:    popq %rbp
+; AVX512BW-NEXT:    kmovw (%rdi), %k1
+; AVX512BW-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3]
+; AVX512BW-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512BW-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k1} {z}
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15]
+; AVX512BW-NEXT:    vpermd %zmm0, %zmm2, %zmm2
+; AVX512BW-NEXT:    vptestmd %zmm2, %zmm2, %k1
+; AVX512BW-NEXT:    vmovdqa32 256(%rsi), %zmm2 {%k1} {z}
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12]
+; AVX512BW-NEXT:    vpermd %zmm0, %zmm3, %zmm3
+; AVX512BW-NEXT:    vptestmd %zmm3, %zmm3, %k1
+; AVX512BW-NEXT:    vmovdqa32 192(%rsi), %zmm3 {%k1} {z}
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9]
+; AVX512BW-NEXT:    vpermd %zmm0, %zmm4, %zmm4
+; AVX512BW-NEXT:    vptestmd %zmm4, %zmm4, %k1
+; AVX512BW-NEXT:    vmovdqa32 128(%rsi), %zmm4 {%k1} {z}
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6]
+; AVX512BW-NEXT:    vpermd %zmm0, %zmm5, %zmm0
+; AVX512BW-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; AVX512BW-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, 128(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, 192(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, 256(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, (%rdx)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
-  %src.vec = load <16 x i1>, ptr %in.vec, align 64
-  %tgt.mask = shufflevector <16 x i1> %src.vec, <16 x i1> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
-  store <80 x i1> %tgt.mask, ptr %out.vec, align 64
+  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
+  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %tgt.mask = shufflevector <16 x i1> %src.mask, <16 x i1> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
+  %data = call <80 x i32> @llvm.masked.load.v80i32.p0(ptr %in.vec, i32 64, <80 x i1> %tgt.mask, <80 x i32> poison)
+  store <80 x i32> %data, ptr %out.vec, align 64
   ret void
 }
 
-define void @mask_replication_factor5_vf32(ptr %in.vec, ptr %out.vec) nounwind {
+define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512F-ONLY-LABEL: mask_replication_factor5_vf32:
 ; AVX512F-ONLY:       # %bb.0:
-; AVX512F-ONLY-NEXT:    pushq %rbp
-; AVX512F-ONLY-NEXT:    pushq %r15
-; AVX512F-ONLY-NEXT:    pushq %r14
-; AVX512F-ONLY-NEXT:    pushq %r13
-; AVX512F-ONLY-NEXT:    pushq %r12
-; AVX512F-ONLY-NEXT:    pushq %rbx
 ; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
-; AVX512F-ONLY-NEXT:    kmovw 2(%rdi), %k0
-; AVX512F-ONLY-NEXT:    kshiftrw $9, %k0, %k2
-; AVX512F-ONLY-NEXT:    kmovw %k2, %ecx
-; AVX512F-ONLY-NEXT:    kshiftrw $7, %k0, %k2
-; AVX512F-ONLY-NEXT:    kmovw %k2, %eax
-; AVX512F-ONLY-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512F-ONLY-NEXT:    kshiftrw $3, %k0, %k2
-; AVX512F-ONLY-NEXT:    kmovw %k2, %edx
-; AVX512F-ONLY-NEXT:    kshiftrw $2, %k0, %k2
-; AVX512F-ONLY-NEXT:    kmovw %k2, %edi
-; AVX512F-ONLY-NEXT:    kshiftrw $1, %k0, %k2
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r8d
-; AVX512F-ONLY-NEXT:    kshiftrw $15, %k1, %k2
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r10d
-; AVX512F-ONLY-NEXT:    kshiftrw $14, %k1, %k2
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r11d
-; AVX512F-ONLY-NEXT:    kshiftrw $13, %k1, %k2
-; AVX512F-ONLY-NEXT:    kmovw %k2, %ebx
-; AVX512F-ONLY-NEXT:    kshiftrw $10, %k0, %k2
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r14d
-; AVX512F-ONLY-NEXT:    kshiftrw $11, %k0, %k2
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r15d
-; AVX512F-ONLY-NEXT:    kshiftrw $12, %k0, %k2
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $13, %k0, %k2
-; AVX512F-ONLY-NEXT:    andl $1, %r14d
-; AVX512F-ONLY-NEXT:    movl %ecx, %r9d
-; AVX512F-ONLY-NEXT:    andl $1, %r9d
-; AVX512F-ONLY-NEXT:    leal (%r9,%r9,2), %r13d
-; AVX512F-ONLY-NEXT:    leal (%r13,%r14,4), %r13d
-; AVX512F-ONLY-NEXT:    leal (%r13,%r14,8), %ebp
-; AVX512F-ONLY-NEXT:    movl %r14d, %r13d
-; AVX512F-ONLY-NEXT:    shll $4, %r13d
-; AVX512F-ONLY-NEXT:    orl %ebp, %r13d
-; AVX512F-ONLY-NEXT:    movl %r14d, %ebp
-; AVX512F-ONLY-NEXT:    shll $5, %ebp
-; AVX512F-ONLY-NEXT:    orl %r13d, %ebp
-; AVX512F-ONLY-NEXT:    shll $6, %r14d
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    movl %r15d, %r13d
-; AVX512F-ONLY-NEXT:    shll $7, %r13d
-; AVX512F-ONLY-NEXT:    orl %r14d, %r13d
-; AVX512F-ONLY-NEXT:    movl %r15d, %r14d
-; AVX512F-ONLY-NEXT:    shll $8, %r14d
-; AVX512F-ONLY-NEXT:    orl %r13d, %r14d
-; AVX512F-ONLY-NEXT:    movl %r15d, %r13d
-; AVX512F-ONLY-NEXT:    shll $9, %r13d
-; AVX512F-ONLY-NEXT:    orl %r14d, %r13d
-; AVX512F-ONLY-NEXT:    movl %r15d, %eax
-; AVX512F-ONLY-NEXT:    shll $10, %eax
-; AVX512F-ONLY-NEXT:    orl %r13d, %eax
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r14d
-; AVX512F-ONLY-NEXT:    kshiftrw $14, %k0, %k2
-; AVX512F-ONLY-NEXT:    shll $11, %r15d
-; AVX512F-ONLY-NEXT:    orl %eax, %r15d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movl %r12d, %eax
-; AVX512F-ONLY-NEXT:    shll $12, %eax
-; AVX512F-ONLY-NEXT:    orl %r15d, %eax
-; AVX512F-ONLY-NEXT:    movl %r12d, %r15d
-; AVX512F-ONLY-NEXT:    shll $13, %r15d
-; AVX512F-ONLY-NEXT:    orl %eax, %r15d
-; AVX512F-ONLY-NEXT:    movl %r12d, %eax
-; AVX512F-ONLY-NEXT:    shll $14, %eax
-; AVX512F-ONLY-NEXT:    orl %r15d, %eax
-; AVX512F-ONLY-NEXT:    movl %r12d, %r13d
-; AVX512F-ONLY-NEXT:    shll $15, %r13d
-; AVX512F-ONLY-NEXT:    orl %eax, %r13d
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r15d
-; AVX512F-ONLY-NEXT:    kshiftrw $15, %k0, %k2
-; AVX512F-ONLY-NEXT:    shll $16, %r12d
-; AVX512F-ONLY-NEXT:    orl %r13d, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r14d
-; AVX512F-ONLY-NEXT:    movl %r14d, %eax
-; AVX512F-ONLY-NEXT:    shll $17, %eax
-; AVX512F-ONLY-NEXT:    orl %r12d, %eax
-; AVX512F-ONLY-NEXT:    movl %r14d, %r12d
-; AVX512F-ONLY-NEXT:    shll $18, %r12d
-; AVX512F-ONLY-NEXT:    orl %eax, %r12d
-; AVX512F-ONLY-NEXT:    movl %r14d, %eax
-; AVX512F-ONLY-NEXT:    shll $19, %eax
-; AVX512F-ONLY-NEXT:    orl %r12d, %eax
-; AVX512F-ONLY-NEXT:    movl %r14d, %r12d
-; AVX512F-ONLY-NEXT:    shll $20, %r12d
-; AVX512F-ONLY-NEXT:    orl %eax, %r12d
-; AVX512F-ONLY-NEXT:    kmovw %k2, %eax
-; AVX512F-ONLY-NEXT:    kshiftrw $6, %k1, %k2
-; AVX512F-ONLY-NEXT:    shll $21, %r14d
-; AVX512F-ONLY-NEXT:    orl %r12d, %r14d
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    movl %r15d, %r12d
-; AVX512F-ONLY-NEXT:    shll $22, %r12d
-; AVX512F-ONLY-NEXT:    orl %r14d, %r12d
-; AVX512F-ONLY-NEXT:    movl %r15d, %r14d
-; AVX512F-ONLY-NEXT:    shll $23, %r14d
-; AVX512F-ONLY-NEXT:    orl %r12d, %r14d
-; AVX512F-ONLY-NEXT:    movl %r15d, %r12d
-; AVX512F-ONLY-NEXT:    shll $24, %r12d
-; AVX512F-ONLY-NEXT:    orl %r14d, %r12d
-; AVX512F-ONLY-NEXT:    movl %r15d, %r14d
-; AVX512F-ONLY-NEXT:    shll $25, %r14d
-; AVX512F-ONLY-NEXT:    orl %r12d, %r14d
-; AVX512F-ONLY-NEXT:    shll $26, %r15d
-; AVX512F-ONLY-NEXT:    orl %r14d, %r15d
-; AVX512F-ONLY-NEXT:    movl %eax, %r14d
-; AVX512F-ONLY-NEXT:    andl $1, %r14d
-; AVX512F-ONLY-NEXT:    movl %r14d, %r12d
-; AVX512F-ONLY-NEXT:    shll $27, %r12d
-; AVX512F-ONLY-NEXT:    orl %r15d, %r12d
-; AVX512F-ONLY-NEXT:    movl %r14d, %r15d
-; AVX512F-ONLY-NEXT:    shll $28, %r15d
-; AVX512F-ONLY-NEXT:    orl %r12d, %r15d
-; AVX512F-ONLY-NEXT:    movl %r14d, %r12d
-; AVX512F-ONLY-NEXT:    shll $29, %r12d
-; AVX512F-ONLY-NEXT:    orl %r15d, %r12d
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r15d
-; AVX512F-ONLY-NEXT:    kshiftrw $1, %k1, %k2
-; AVX512F-ONLY-NEXT:    shll $30, %r14d
-; AVX512F-ONLY-NEXT:    orl %r12d, %r14d
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $2, %k1, %k2
-; AVX512F-ONLY-NEXT:    shll $31, %eax
-; AVX512F-ONLY-NEXT:    orl %r14d, %eax
-; AVX512F-ONLY-NEXT:    orl %ebp, %eax
-; AVX512F-ONLY-NEXT:    movl %eax, 16(%rsi)
-; AVX512F-ONLY-NEXT:    kmovw %k1, %eax
-; AVX512F-ONLY-NEXT:    movzbl %al, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    leaq (%rax,%rax,2), %r14
-; AVX512F-ONLY-NEXT:    leaq (%r14,%rax,4), %r14
-; AVX512F-ONLY-NEXT:    leaq (%r14,%rax,8), %r14
-; AVX512F-ONLY-NEXT:    shlq $4, %rax
-; AVX512F-ONLY-NEXT:    orq %r14, %rax
-; AVX512F-ONLY-NEXT:    movzbl %r12b, %r13d
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %r14
-; AVX512F-ONLY-NEXT:    shlq $5, %r14
-; AVX512F-ONLY-NEXT:    orq %rax, %r14
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $6, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $7, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $8, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k2, %ebp
-; AVX512F-ONLY-NEXT:    kshiftrw $3, %k1, %k2
-; AVX512F-ONLY-NEXT:    shlq $9, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movzbl %bpl, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $10, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $11, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $12, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $13, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k2, %eax
-; AVX512F-ONLY-NEXT:    kshiftrw $4, %k1, %k2
-; AVX512F-ONLY-NEXT:    shlq $14, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    movzbl %al, %r13d
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $15, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $16, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $17, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $18, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k2, %eax
-; AVX512F-ONLY-NEXT:    kshiftrw $5, %k1, %k2
-; AVX512F-ONLY-NEXT:    shlq $19, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    movzbl %al, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $20, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $21, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $22, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $23, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k2, %eax
-; AVX512F-ONLY-NEXT:    kshiftrw $12, %k1, %k2
-; AVX512F-ONLY-NEXT:    shlq $24, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    movzbl %al, %r13d
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $25, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $26, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $27, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $28, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $29, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    movzbl %r15b, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $30, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k2, %ebp
-; AVX512F-ONLY-NEXT:    kshiftrw $7, %k1, %k2
-; AVX512F-ONLY-NEXT:    shlq $31, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $32, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $33, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $8, %k1, %k2
-; AVX512F-ONLY-NEXT:    shlq $34, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $35, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r15
-; AVX512F-ONLY-NEXT:    shlq $36, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $37, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $38, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r15d
-; AVX512F-ONLY-NEXT:    kshiftrw $9, %k1, %k2
-; AVX512F-ONLY-NEXT:    shlq $39, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $40, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $41, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $42, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %r13
-; AVX512F-ONLY-NEXT:    shlq $43, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $10, %k1, %k2
-; AVX512F-ONLY-NEXT:    shlq $44, %r15
-; AVX512F-ONLY-NEXT:    orq %r13, %r15
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $45, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r15
-; AVX512F-ONLY-NEXT:    shlq $46, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $47, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $48, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r15d
-; AVX512F-ONLY-NEXT:    kshiftrw $11, %k1, %k1
-; AVX512F-ONLY-NEXT:    shlq $49, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $50, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $51, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $52, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $53, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k1, %r13d
-; AVX512F-ONLY-NEXT:    kshiftrw $4, %k0, %k1
-; AVX512F-ONLY-NEXT:    shlq $54, %r15
-; AVX512F-ONLY-NEXT:    orq %r12, %r15
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $55, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r15
-; AVX512F-ONLY-NEXT:    shlq $56, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $57, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r15
-; AVX512F-ONLY-NEXT:    shlq $58, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    shlq $59, %r13
-; AVX512F-ONLY-NEXT:    orq %r15, %r13
-; AVX512F-ONLY-NEXT:    movzbl %bpl, %r15d
-; AVX512F-ONLY-NEXT:    movl %ebp, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $60, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $61, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k1, %ebp
-; AVX512F-ONLY-NEXT:    kshiftrw $5, %k0, %k1
-; AVX512F-ONLY-NEXT:    shlq $62, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    movq %r15, %r13
-; AVX512F-ONLY-NEXT:    shlq $63, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k1, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $6, %k0, %k1
-; AVX512F-ONLY-NEXT:    orq %r14, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k1, %r14d
-; AVX512F-ONLY-NEXT:    movq %r13, (%rsi)
-; AVX512F-ONLY-NEXT:    movzbl %bl, %ebx
-; AVX512F-ONLY-NEXT:    andl $1, %ebx
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    leaq (%r15,%rbx,2), %rax
-; AVX512F-ONLY-NEXT:    leaq (%rax,%rbx,4), %rax
-; AVX512F-ONLY-NEXT:    leaq (%rax,%rbx,8), %rax
-; AVX512F-ONLY-NEXT:    movq %rbx, %r15
-; AVX512F-ONLY-NEXT:    shlq $4, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    shlq $5, %rbx
-; AVX512F-ONLY-NEXT:    orq %r15, %rbx
-; AVX512F-ONLY-NEXT:    movzbl %r11b, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %r11
-; AVX512F-ONLY-NEXT:    shlq $6, %r11
-; AVX512F-ONLY-NEXT:    movq %rax, %r15
-; AVX512F-ONLY-NEXT:    shlq $7, %r15
-; AVX512F-ONLY-NEXT:    orq %r11, %r15
-; AVX512F-ONLY-NEXT:    movq %rax, %r11
-; AVX512F-ONLY-NEXT:    shlq $8, %r11
-; AVX512F-ONLY-NEXT:    orq %r15, %r11
-; AVX512F-ONLY-NEXT:    movq %rax, %r13
-; AVX512F-ONLY-NEXT:    shlq $9, %r13
-; AVX512F-ONLY-NEXT:    orq %r11, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k0, %r15d
-; AVX512F-ONLY-NEXT:    kshiftrw $8, %k0, %k0
-; AVX512F-ONLY-NEXT:    shlq $10, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movzbl %r10b, %r11d
-; AVX512F-ONLY-NEXT:    andl $1, %r11d
-; AVX512F-ONLY-NEXT:    movq %r11, %r10
-; AVX512F-ONLY-NEXT:    shlq $11, %r10
-; AVX512F-ONLY-NEXT:    orq %rax, %r10
-; AVX512F-ONLY-NEXT:    movq %r11, %rax
-; AVX512F-ONLY-NEXT:    shlq $12, %rax
-; AVX512F-ONLY-NEXT:    orq %r10, %rax
-; AVX512F-ONLY-NEXT:    movq %r11, %r10
-; AVX512F-ONLY-NEXT:    shlq $13, %r10
-; AVX512F-ONLY-NEXT:    orq %rax, %r10
-; AVX512F-ONLY-NEXT:    movq %r11, %rax
-; AVX512F-ONLY-NEXT:    shlq $14, %rax
-; AVX512F-ONLY-NEXT:    orq %r10, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k0, %r10d
-; AVX512F-ONLY-NEXT:    shlq $15, %r11
-; AVX512F-ONLY-NEXT:    orq %rax, %r11
-; AVX512F-ONLY-NEXT:    movzbl %r15b, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %r15
-; AVX512F-ONLY-NEXT:    shlq $16, %r15
-; AVX512F-ONLY-NEXT:    orq %r11, %r15
-; AVX512F-ONLY-NEXT:    movq %rax, %r11
-; AVX512F-ONLY-NEXT:    shlq $17, %r11
-; AVX512F-ONLY-NEXT:    orq %r15, %r11
-; AVX512F-ONLY-NEXT:    movq %rax, %r15
-; AVX512F-ONLY-NEXT:    shlq $18, %r15
-; AVX512F-ONLY-NEXT:    orq %r11, %r15
-; AVX512F-ONLY-NEXT:    movq %rax, %r11
-; AVX512F-ONLY-NEXT:    shlq $19, %r11
-; AVX512F-ONLY-NEXT:    orq %r15, %r11
-; AVX512F-ONLY-NEXT:    shlq $20, %rax
-; AVX512F-ONLY-NEXT:    orq %r11, %rax
-; AVX512F-ONLY-NEXT:    movzbl %r8b, %r8d
-; AVX512F-ONLY-NEXT:    andl $1, %r8d
-; AVX512F-ONLY-NEXT:    movq %r8, %r11
-; AVX512F-ONLY-NEXT:    shlq $21, %r11
-; AVX512F-ONLY-NEXT:    orq %rax, %r11
-; AVX512F-ONLY-NEXT:    movq %r8, %rax
-; AVX512F-ONLY-NEXT:    shlq $22, %rax
-; AVX512F-ONLY-NEXT:    orq %r11, %rax
-; AVX512F-ONLY-NEXT:    movq %r8, %r11
-; AVX512F-ONLY-NEXT:    shlq $23, %r11
-; AVX512F-ONLY-NEXT:    orq %rax, %r11
-; AVX512F-ONLY-NEXT:    movq %r8, %rax
-; AVX512F-ONLY-NEXT:    shlq $24, %rax
-; AVX512F-ONLY-NEXT:    orq %r11, %rax
-; AVX512F-ONLY-NEXT:    shlq $25, %r8
-; AVX512F-ONLY-NEXT:    orq %rax, %r8
-; AVX512F-ONLY-NEXT:    movzbl %dil, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %rdi
-; AVX512F-ONLY-NEXT:    shlq $26, %rdi
-; AVX512F-ONLY-NEXT:    orq %r8, %rdi
-; AVX512F-ONLY-NEXT:    movq %rax, %r8
-; AVX512F-ONLY-NEXT:    shlq $27, %r8
-; AVX512F-ONLY-NEXT:    orq %rdi, %r8
-; AVX512F-ONLY-NEXT:    movq %rax, %rdi
-; AVX512F-ONLY-NEXT:    shlq $28, %rdi
-; AVX512F-ONLY-NEXT:    orq %r8, %rdi
-; AVX512F-ONLY-NEXT:    movq %rax, %r8
-; AVX512F-ONLY-NEXT:    shlq $29, %r8
-; AVX512F-ONLY-NEXT:    orq %rdi, %r8
-; AVX512F-ONLY-NEXT:    shlq $30, %rax
-; AVX512F-ONLY-NEXT:    orq %r8, %rax
-; AVX512F-ONLY-NEXT:    movzbl %dl, %edx
-; AVX512F-ONLY-NEXT:    andl $1, %edx
-; AVX512F-ONLY-NEXT:    movq %rdx, %rdi
-; AVX512F-ONLY-NEXT:    shlq $31, %rdi
-; AVX512F-ONLY-NEXT:    orq %rax, %rdi
-; AVX512F-ONLY-NEXT:    movq %rdx, %rax
-; AVX512F-ONLY-NEXT:    shlq $32, %rax
-; AVX512F-ONLY-NEXT:    orq %rdi, %rax
-; AVX512F-ONLY-NEXT:    movq %rdx, %rdi
-; AVX512F-ONLY-NEXT:    shlq $33, %rdi
-; AVX512F-ONLY-NEXT:    orq %rax, %rdi
-; AVX512F-ONLY-NEXT:    movq %rdx, %rax
-; AVX512F-ONLY-NEXT:    shlq $34, %rax
-; AVX512F-ONLY-NEXT:    orq %rdi, %rax
-; AVX512F-ONLY-NEXT:    shlq $35, %rdx
-; AVX512F-ONLY-NEXT:    orq %rax, %rdx
-; AVX512F-ONLY-NEXT:    andl $1, %ebp
-; AVX512F-ONLY-NEXT:    movq %rbp, %rax
-; AVX512F-ONLY-NEXT:    shlq $36, %rax
-; AVX512F-ONLY-NEXT:    orq %rdx, %rax
-; AVX512F-ONLY-NEXT:    movq %rbp, %rdx
-; AVX512F-ONLY-NEXT:    shlq $37, %rdx
-; AVX512F-ONLY-NEXT:    orq %rax, %rdx
-; AVX512F-ONLY-NEXT:    movq %rbp, %rax
-; AVX512F-ONLY-NEXT:    shlq $38, %rax
-; AVX512F-ONLY-NEXT:    orq %rdx, %rax
-; AVX512F-ONLY-NEXT:    movq %rbp, %rdx
-; AVX512F-ONLY-NEXT:    shlq $39, %rdx
-; AVX512F-ONLY-NEXT:    orq %rax, %rdx
-; AVX512F-ONLY-NEXT:    shlq $40, %rbp
-; AVX512F-ONLY-NEXT:    orq %rdx, %rbp
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $41, %rax
-; AVX512F-ONLY-NEXT:    orq %rbp, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %rdx
-; AVX512F-ONLY-NEXT:    shlq $42, %rdx
-; AVX512F-ONLY-NEXT:    orq %rax, %rdx
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $43, %rax
-; AVX512F-ONLY-NEXT:    orq %rdx, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %rdx
-; AVX512F-ONLY-NEXT:    shlq $44, %rdx
-; AVX512F-ONLY-NEXT:    orq %rax, %rdx
-; AVX512F-ONLY-NEXT:    shlq $45, %r12
-; AVX512F-ONLY-NEXT:    orq %rdx, %r12
-; AVX512F-ONLY-NEXT:    andl $1, %r14d
-; AVX512F-ONLY-NEXT:    movq %r14, %rax
-; AVX512F-ONLY-NEXT:    shlq $46, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r14, %rdx
-; AVX512F-ONLY-NEXT:    shlq $47, %rdx
-; AVX512F-ONLY-NEXT:    orq %rax, %rdx
-; AVX512F-ONLY-NEXT:    movq %r14, %rax
-; AVX512F-ONLY-NEXT:    shlq $48, %rax
-; AVX512F-ONLY-NEXT:    orq %rdx, %rax
-; AVX512F-ONLY-NEXT:    movq %r14, %rdx
-; AVX512F-ONLY-NEXT:    shlq $49, %rdx
-; AVX512F-ONLY-NEXT:    orq %rax, %rdx
-; AVX512F-ONLY-NEXT:    shlq $50, %r14
-; AVX512F-ONLY-NEXT:    orq %rdx, %r14
-; AVX512F-ONLY-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %rdx
-; AVX512F-ONLY-NEXT:    shlq $51, %rdx
-; AVX512F-ONLY-NEXT:    orq %r14, %rdx
-; AVX512F-ONLY-NEXT:    movq %rax, %rdi
-; AVX512F-ONLY-NEXT:    shlq $52, %rdi
-; AVX512F-ONLY-NEXT:    orq %rdx, %rdi
-; AVX512F-ONLY-NEXT:    movq %rax, %rdx
-; AVX512F-ONLY-NEXT:    shlq $53, %rdx
-; AVX512F-ONLY-NEXT:    orq %rdi, %rdx
-; AVX512F-ONLY-NEXT:    movq %rax, %rdi
-; AVX512F-ONLY-NEXT:    shlq $54, %rdi
-; AVX512F-ONLY-NEXT:    orq %rdx, %rdi
-; AVX512F-ONLY-NEXT:    shlq $55, %rax
-; AVX512F-ONLY-NEXT:    orq %rdi, %rax
-; AVX512F-ONLY-NEXT:    andl $1, %r10d
-; AVX512F-ONLY-NEXT:    movq %r10, %rdx
-; AVX512F-ONLY-NEXT:    shlq $56, %rdx
-; AVX512F-ONLY-NEXT:    orq %rax, %rdx
-; AVX512F-ONLY-NEXT:    movq %r10, %rax
-; AVX512F-ONLY-NEXT:    shlq $57, %rax
-; AVX512F-ONLY-NEXT:    orq %rdx, %rax
-; AVX512F-ONLY-NEXT:    movq %r10, %rdx
-; AVX512F-ONLY-NEXT:    shlq $58, %rdx
-; AVX512F-ONLY-NEXT:    orq %rax, %rdx
-; AVX512F-ONLY-NEXT:    movq %r10, %rax
-; AVX512F-ONLY-NEXT:    shlq $59, %rax
-; AVX512F-ONLY-NEXT:    orq %rdx, %rax
-; AVX512F-ONLY-NEXT:    shlq $60, %r10
-; AVX512F-ONLY-NEXT:    orq %rax, %r10
-; AVX512F-ONLY-NEXT:    movq %r9, %rax
-; AVX512F-ONLY-NEXT:    shlq $61, %rax
-; AVX512F-ONLY-NEXT:    orq %r10, %rax
-; AVX512F-ONLY-NEXT:    shlq $62, %r9
-; AVX512F-ONLY-NEXT:    orq %rax, %r9
-; AVX512F-ONLY-NEXT:    movzbl %cl, %eax
-; AVX512F-ONLY-NEXT:    shlq $63, %rax
-; AVX512F-ONLY-NEXT:    orq %r9, %rax
-; AVX512F-ONLY-NEXT:    orq %rbx, %rax
-; AVX512F-ONLY-NEXT:    movq %rax, 8(%rsi)
-; AVX512F-ONLY-NEXT:    popq %rbx
-; AVX512F-ONLY-NEXT:    popq %r12
-; AVX512F-ONLY-NEXT:    popq %r13
-; AVX512F-ONLY-NEXT:    popq %r14
-; AVX512F-ONLY-NEXT:    popq %r15
-; AVX512F-ONLY-NEXT:    popq %rbp
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm2
+; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; AVX512F-ONLY-NEXT:    movw $1, %ax
+; AVX512F-ONLY-NEXT:    kmovw %eax, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1}
+; AVX512F-ONLY-NEXT:    kmovw 2(%rdi), %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm2, %zmm4
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm5, %zmm6
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm7, %zmm8
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm9, %zmm0
+; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm2, %zmm2
+; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm5, %zmm5
+; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm7, %zmm7
+; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm9, %zmm3
+; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm9 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm3, %zmm3, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 576(%rsi), %zmm3 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm7, %zmm7, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 512(%rsi), %zmm7 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm5, %zmm5, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 448(%rsi), %zmm5 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 384(%rsi), %zmm2 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 320(%rsi), %zmm1 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 256(%rsi), %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm8, %zmm8, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm8 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm6, %zmm6, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm6 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm4, %zmm4, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm4 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm4, 64(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm6, 128(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm8, 192(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, 256(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm1, 320(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm2, 384(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm5, 448(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm7, 512(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm3, 576(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm9, (%rdx)
+; AVX512F-ONLY-NEXT:    vzeroupper
 ; AVX512F-ONLY-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: mask_replication_factor5_vf32:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    pushq %rbp
-; AVX512DQ-NEXT:    pushq %r15
-; AVX512DQ-NEXT:    pushq %r14
-; AVX512DQ-NEXT:    pushq %r13
-; AVX512DQ-NEXT:    pushq %r12
-; AVX512DQ-NEXT:    pushq %rbx
-; AVX512DQ-NEXT:    kmovw (%rdi), %k1
+; AVX512DQ-NEXT:    kmovw (%rdi), %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm2
+; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm2
+; AVX512DQ-NEXT:    movw $1, %ax
+; AVX512DQ-NEXT:    kmovw %eax, %k1
+; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1}
 ; AVX512DQ-NEXT:    kmovw 2(%rdi), %k0
-; AVX512DQ-NEXT:    kshiftrw $9, %k0, %k2
-; AVX512DQ-NEXT:    kmovw %k2, %ecx
-; AVX512DQ-NEXT:    kshiftrw $7, %k0, %k2
-; AVX512DQ-NEXT:    kmovw %k2, %eax
-; AVX512DQ-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512DQ-NEXT:    kshiftrw $3, %k0, %k2
-; AVX512DQ-NEXT:    kmovw %k2, %edx
-; AVX512DQ-NEXT:    kshiftrw $2, %k0, %k2
-; AVX512DQ-NEXT:    kmovw %k2, %edi
-; AVX512DQ-NEXT:    kshiftrw $1, %k0, %k2
-; AVX512DQ-NEXT:    kmovw %k2, %r8d
-; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k2
-; AVX512DQ-NEXT:    kmovw %k2, %r10d
-; AVX512DQ-NEXT:    kshiftrw $14, %k1, %k2
-; AVX512DQ-NEXT:    kmovw %k2, %r11d
-; AVX512DQ-NEXT:    kshiftrw $13, %k1, %k2
-; AVX512DQ-NEXT:    kmovw %k2, %ebx
-; AVX512DQ-NEXT:    kshiftrw $10, %k0, %k2
-; AVX512DQ-NEXT:    kmovw %k2, %r14d
-; AVX512DQ-NEXT:    kshiftrw $11, %k0, %k2
-; AVX512DQ-NEXT:    kmovw %k2, %r15d
-; AVX512DQ-NEXT:    kshiftrw $12, %k0, %k2
-; AVX512DQ-NEXT:    kmovw %k2, %r12d
-; AVX512DQ-NEXT:    kshiftrw $13, %k0, %k2
-; AVX512DQ-NEXT:    andl $1, %r14d
-; AVX512DQ-NEXT:    movl %ecx, %r9d
-; AVX512DQ-NEXT:    andl $1, %r9d
-; AVX512DQ-NEXT:    leal (%r9,%r9,2), %r13d
-; AVX512DQ-NEXT:    leal (%r13,%r14,4), %r13d
-; AVX512DQ-NEXT:    leal (%r13,%r14,8), %ebp
-; AVX512DQ-NEXT:    movl %r14d, %r13d
-; AVX512DQ-NEXT:    shll $4, %r13d
-; AVX512DQ-NEXT:    orl %ebp, %r13d
-; AVX512DQ-NEXT:    movl %r14d, %ebp
-; AVX512DQ-NEXT:    shll $5, %ebp
-; AVX512DQ-NEXT:    orl %r13d, %ebp
-; AVX512DQ-NEXT:    shll $6, %r14d
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    movl %r15d, %r13d
-; AVX512DQ-NEXT:    shll $7, %r13d
-; AVX512DQ-NEXT:    orl %r14d, %r13d
-; AVX512DQ-NEXT:    movl %r15d, %r14d
-; AVX512DQ-NEXT:    shll $8, %r14d
-; AVX512DQ-NEXT:    orl %r13d, %r14d
-; AVX512DQ-NEXT:    movl %r15d, %r13d
-; AVX512DQ-NEXT:    shll $9, %r13d
-; AVX512DQ-NEXT:    orl %r14d, %r13d
-; AVX512DQ-NEXT:    movl %r15d, %eax
-; AVX512DQ-NEXT:    shll $10, %eax
-; AVX512DQ-NEXT:    orl %r13d, %eax
-; AVX512DQ-NEXT:    kmovw %k2, %r14d
-; AVX512DQ-NEXT:    kshiftrw $14, %k0, %k2
-; AVX512DQ-NEXT:    shll $11, %r15d
-; AVX512DQ-NEXT:    orl %eax, %r15d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movl %r12d, %eax
-; AVX512DQ-NEXT:    shll $12, %eax
-; AVX512DQ-NEXT:    orl %r15d, %eax
-; AVX512DQ-NEXT:    movl %r12d, %r15d
-; AVX512DQ-NEXT:    shll $13, %r15d
-; AVX512DQ-NEXT:    orl %eax, %r15d
-; AVX512DQ-NEXT:    movl %r12d, %eax
-; AVX512DQ-NEXT:    shll $14, %eax
-; AVX512DQ-NEXT:    orl %r15d, %eax
-; AVX512DQ-NEXT:    movl %r12d, %r13d
-; AVX512DQ-NEXT:    shll $15, %r13d
-; AVX512DQ-NEXT:    orl %eax, %r13d
-; AVX512DQ-NEXT:    kmovw %k2, %r15d
-; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k2
-; AVX512DQ-NEXT:    shll $16, %r12d
-; AVX512DQ-NEXT:    orl %r13d, %r12d
-; AVX512DQ-NEXT:    andl $1, %r14d
-; AVX512DQ-NEXT:    movl %r14d, %eax
-; AVX512DQ-NEXT:    shll $17, %eax
-; AVX512DQ-NEXT:    orl %r12d, %eax
-; AVX512DQ-NEXT:    movl %r14d, %r12d
-; AVX512DQ-NEXT:    shll $18, %r12d
-; AVX512DQ-NEXT:    orl %eax, %r12d
-; AVX512DQ-NEXT:    movl %r14d, %eax
-; AVX512DQ-NEXT:    shll $19, %eax
-; AVX512DQ-NEXT:    orl %r12d, %eax
-; AVX512DQ-NEXT:    movl %r14d, %r12d
-; AVX512DQ-NEXT:    shll $20, %r12d
-; AVX512DQ-NEXT:    orl %eax, %r12d
-; AVX512DQ-NEXT:    kmovw %k2, %eax
-; AVX512DQ-NEXT:    kshiftrw $6, %k1, %k2
-; AVX512DQ-NEXT:    shll $21, %r14d
-; AVX512DQ-NEXT:    orl %r12d, %r14d
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    movl %r15d, %r12d
-; AVX512DQ-NEXT:    shll $22, %r12d
-; AVX512DQ-NEXT:    orl %r14d, %r12d
-; AVX512DQ-NEXT:    movl %r15d, %r14d
-; AVX512DQ-NEXT:    shll $23, %r14d
-; AVX512DQ-NEXT:    orl %r12d, %r14d
-; AVX512DQ-NEXT:    movl %r15d, %r12d
-; AVX512DQ-NEXT:    shll $24, %r12d
-; AVX512DQ-NEXT:    orl %r14d, %r12d
-; AVX512DQ-NEXT:    movl %r15d, %r14d
-; AVX512DQ-NEXT:    shll $25, %r14d
-; AVX512DQ-NEXT:    orl %r12d, %r14d
-; AVX512DQ-NEXT:    shll $26, %r15d
-; AVX512DQ-NEXT:    orl %r14d, %r15d
-; AVX512DQ-NEXT:    movl %eax, %r14d
-; AVX512DQ-NEXT:    andl $1, %r14d
-; AVX512DQ-NEXT:    movl %r14d, %r12d
-; AVX512DQ-NEXT:    shll $27, %r12d
-; AVX512DQ-NEXT:    orl %r15d, %r12d
-; AVX512DQ-NEXT:    movl %r14d, %r15d
-; AVX512DQ-NEXT:    shll $28, %r15d
-; AVX512DQ-NEXT:    orl %r12d, %r15d
-; AVX512DQ-NEXT:    movl %r14d, %r12d
-; AVX512DQ-NEXT:    shll $29, %r12d
-; AVX512DQ-NEXT:    orl %r15d, %r12d
-; AVX512DQ-NEXT:    kmovw %k2, %r15d
-; AVX512DQ-NEXT:    kshiftrw $1, %k1, %k2
-; AVX512DQ-NEXT:    shll $30, %r14d
-; AVX512DQ-NEXT:    orl %r12d, %r14d
-; AVX512DQ-NEXT:    kmovw %k2, %r12d
-; AVX512DQ-NEXT:    kshiftrw $2, %k1, %k2
-; AVX512DQ-NEXT:    shll $31, %eax
-; AVX512DQ-NEXT:    orl %r14d, %eax
-; AVX512DQ-NEXT:    orl %ebp, %eax
-; AVX512DQ-NEXT:    movl %eax, 16(%rsi)
-; AVX512DQ-NEXT:    kmovw %k1, %eax
-; AVX512DQ-NEXT:    movzbl %al, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    leaq (%rax,%rax,2), %r14
-; AVX512DQ-NEXT:    leaq (%r14,%rax,4), %r14
-; AVX512DQ-NEXT:    leaq (%r14,%rax,8), %r14
-; AVX512DQ-NEXT:    shlq $4, %rax
-; AVX512DQ-NEXT:    orq %r14, %rax
-; AVX512DQ-NEXT:    movzbl %r12b, %r13d
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %r14
-; AVX512DQ-NEXT:    shlq $5, %r14
-; AVX512DQ-NEXT:    orq %rax, %r14
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $6, %rax
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $7, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $8, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    kmovw %k2, %ebp
-; AVX512DQ-NEXT:    kshiftrw $3, %k1, %k2
-; AVX512DQ-NEXT:    shlq $9, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movzbl %bpl, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $10, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $11, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $12, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $13, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    kmovw %k2, %eax
-; AVX512DQ-NEXT:    kshiftrw $4, %k1, %k2
-; AVX512DQ-NEXT:    shlq $14, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    movzbl %al, %r13d
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $15, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $16, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $17, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $18, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    kmovw %k2, %eax
-; AVX512DQ-NEXT:    kshiftrw $5, %k1, %k2
-; AVX512DQ-NEXT:    shlq $19, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    movzbl %al, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $20, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $21, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $22, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $23, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    kmovw %k2, %eax
-; AVX512DQ-NEXT:    kshiftrw $12, %k1, %k2
-; AVX512DQ-NEXT:    shlq $24, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    movzbl %al, %r13d
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $25, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $26, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $27, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $28, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    shlq $29, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    movzbl %r15b, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %r12
-; AVX512DQ-NEXT:    shlq $30, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    kmovw %k2, %ebp
-; AVX512DQ-NEXT:    kshiftrw $7, %k1, %k2
-; AVX512DQ-NEXT:    shlq $31, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $32, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $33, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    kmovw %k2, %r12d
-; AVX512DQ-NEXT:    kshiftrw $8, %k1, %k2
-; AVX512DQ-NEXT:    shlq $34, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $35, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movq %r12, %r15
-; AVX512DQ-NEXT:    shlq $36, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $37, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $38, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    kmovw %k2, %r15d
-; AVX512DQ-NEXT:    kshiftrw $9, %k1, %k2
-; AVX512DQ-NEXT:    shlq $39, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $40, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $41, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $42, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r15, %r13
-; AVX512DQ-NEXT:    shlq $43, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    kmovw %k2, %r12d
-; AVX512DQ-NEXT:    kshiftrw $10, %k1, %k2
-; AVX512DQ-NEXT:    shlq $44, %r15
-; AVX512DQ-NEXT:    orq %r13, %r15
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $45, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movq %r12, %r15
-; AVX512DQ-NEXT:    shlq $46, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $47, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $48, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    kmovw %k2, %r15d
-; AVX512DQ-NEXT:    kshiftrw $11, %k1, %k1
-; AVX512DQ-NEXT:    shlq $49, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $50, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $51, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $52, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $53, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    kmovw %k1, %r13d
-; AVX512DQ-NEXT:    kshiftrw $4, %k0, %k1
-; AVX512DQ-NEXT:    shlq $54, %r15
-; AVX512DQ-NEXT:    orq %r12, %r15
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $55, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movq %r13, %r15
-; AVX512DQ-NEXT:    shlq $56, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $57, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movq %r13, %r15
-; AVX512DQ-NEXT:    shlq $58, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    shlq $59, %r13
-; AVX512DQ-NEXT:    orq %r15, %r13
-; AVX512DQ-NEXT:    movzbl %bpl, %r15d
-; AVX512DQ-NEXT:    movl %ebp, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $60, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $61, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    kmovw %k1, %ebp
-; AVX512DQ-NEXT:    kshiftrw $5, %k0, %k1
-; AVX512DQ-NEXT:    shlq $62, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    movq %r15, %r13
-; AVX512DQ-NEXT:    shlq $63, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    kmovw %k1, %r12d
-; AVX512DQ-NEXT:    kshiftrw $6, %k0, %k1
-; AVX512DQ-NEXT:    orq %r14, %r13
-; AVX512DQ-NEXT:    kmovw %k1, %r14d
-; AVX512DQ-NEXT:    movq %r13, (%rsi)
-; AVX512DQ-NEXT:    movzbl %bl, %ebx
-; AVX512DQ-NEXT:    andl $1, %ebx
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    leaq (%r15,%rbx,2), %rax
-; AVX512DQ-NEXT:    leaq (%rax,%rbx,4), %rax
-; AVX512DQ-NEXT:    leaq (%rax,%rbx,8), %rax
-; AVX512DQ-NEXT:    movq %rbx, %r15
-; AVX512DQ-NEXT:    shlq $4, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    shlq $5, %rbx
-; AVX512DQ-NEXT:    orq %r15, %rbx
-; AVX512DQ-NEXT:    movzbl %r11b, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %r11
-; AVX512DQ-NEXT:    shlq $6, %r11
-; AVX512DQ-NEXT:    movq %rax, %r15
-; AVX512DQ-NEXT:    shlq $7, %r15
-; AVX512DQ-NEXT:    orq %r11, %r15
-; AVX512DQ-NEXT:    movq %rax, %r11
-; AVX512DQ-NEXT:    shlq $8, %r11
-; AVX512DQ-NEXT:    orq %r15, %r11
-; AVX512DQ-NEXT:    movq %rax, %r13
-; AVX512DQ-NEXT:    shlq $9, %r13
-; AVX512DQ-NEXT:    orq %r11, %r13
-; AVX512DQ-NEXT:    kmovw %k0, %r15d
-; AVX512DQ-NEXT:    kshiftrw $8, %k0, %k0
-; AVX512DQ-NEXT:    shlq $10, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movzbl %r10b, %r11d
-; AVX512DQ-NEXT:    andl $1, %r11d
-; AVX512DQ-NEXT:    movq %r11, %r10
-; AVX512DQ-NEXT:    shlq $11, %r10
-; AVX512DQ-NEXT:    orq %rax, %r10
-; AVX512DQ-NEXT:    movq %r11, %rax
-; AVX512DQ-NEXT:    shlq $12, %rax
-; AVX512DQ-NEXT:    orq %r10, %rax
-; AVX512DQ-NEXT:    movq %r11, %r10
-; AVX512DQ-NEXT:    shlq $13, %r10
-; AVX512DQ-NEXT:    orq %rax, %r10
-; AVX512DQ-NEXT:    movq %r11, %rax
-; AVX512DQ-NEXT:    shlq $14, %rax
-; AVX512DQ-NEXT:    orq %r10, %rax
-; AVX512DQ-NEXT:    kmovw %k0, %r10d
-; AVX512DQ-NEXT:    shlq $15, %r11
-; AVX512DQ-NEXT:    orq %rax, %r11
-; AVX512DQ-NEXT:    movzbl %r15b, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %r15
-; AVX512DQ-NEXT:    shlq $16, %r15
-; AVX512DQ-NEXT:    orq %r11, %r15
-; AVX512DQ-NEXT:    movq %rax, %r11
-; AVX512DQ-NEXT:    shlq $17, %r11
-; AVX512DQ-NEXT:    orq %r15, %r11
-; AVX512DQ-NEXT:    movq %rax, %r15
-; AVX512DQ-NEXT:    shlq $18, %r15
-; AVX512DQ-NEXT:    orq %r11, %r15
-; AVX512DQ-NEXT:    movq %rax, %r11
-; AVX512DQ-NEXT:    shlq $19, %r11
-; AVX512DQ-NEXT:    orq %r15, %r11
-; AVX512DQ-NEXT:    shlq $20, %rax
-; AVX512DQ-NEXT:    orq %r11, %rax
-; AVX512DQ-NEXT:    movzbl %r8b, %r8d
-; AVX512DQ-NEXT:    andl $1, %r8d
-; AVX512DQ-NEXT:    movq %r8, %r11
-; AVX512DQ-NEXT:    shlq $21, %r11
-; AVX512DQ-NEXT:    orq %rax, %r11
-; AVX512DQ-NEXT:    movq %r8, %rax
-; AVX512DQ-NEXT:    shlq $22, %rax
-; AVX512DQ-NEXT:    orq %r11, %rax
-; AVX512DQ-NEXT:    movq %r8, %r11
-; AVX512DQ-NEXT:    shlq $23, %r11
-; AVX512DQ-NEXT:    orq %rax, %r11
-; AVX512DQ-NEXT:    movq %r8, %rax
-; AVX512DQ-NEXT:    shlq $24, %rax
-; AVX512DQ-NEXT:    orq %r11, %rax
-; AVX512DQ-NEXT:    shlq $25, %r8
-; AVX512DQ-NEXT:    orq %rax, %r8
-; AVX512DQ-NEXT:    movzbl %dil, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %rdi
-; AVX512DQ-NEXT:    shlq $26, %rdi
-; AVX512DQ-NEXT:    orq %r8, %rdi
-; AVX512DQ-NEXT:    movq %rax, %r8
-; AVX512DQ-NEXT:    shlq $27, %r8
-; AVX512DQ-NEXT:    orq %rdi, %r8
-; AVX512DQ-NEXT:    movq %rax, %rdi
-; AVX512DQ-NEXT:    shlq $28, %rdi
-; AVX512DQ-NEXT:    orq %r8, %rdi
-; AVX512DQ-NEXT:    movq %rax, %r8
-; AVX512DQ-NEXT:    shlq $29, %r8
-; AVX512DQ-NEXT:    orq %rdi, %r8
-; AVX512DQ-NEXT:    shlq $30, %rax
-; AVX512DQ-NEXT:    orq %r8, %rax
-; AVX512DQ-NEXT:    movzbl %dl, %edx
-; AVX512DQ-NEXT:    andl $1, %edx
-; AVX512DQ-NEXT:    movq %rdx, %rdi
-; AVX512DQ-NEXT:    shlq $31, %rdi
-; AVX512DQ-NEXT:    orq %rax, %rdi
-; AVX512DQ-NEXT:    movq %rdx, %rax
-; AVX512DQ-NEXT:    shlq $32, %rax
-; AVX512DQ-NEXT:    orq %rdi, %rax
-; AVX512DQ-NEXT:    movq %rdx, %rdi
-; AVX512DQ-NEXT:    shlq $33, %rdi
-; AVX512DQ-NEXT:    orq %rax, %rdi
-; AVX512DQ-NEXT:    movq %rdx, %rax
-; AVX512DQ-NEXT:    shlq $34, %rax
-; AVX512DQ-NEXT:    orq %rdi, %rax
-; AVX512DQ-NEXT:    shlq $35, %rdx
-; AVX512DQ-NEXT:    orq %rax, %rdx
-; AVX512DQ-NEXT:    andl $1, %ebp
-; AVX512DQ-NEXT:    movq %rbp, %rax
-; AVX512DQ-NEXT:    shlq $36, %rax
-; AVX512DQ-NEXT:    orq %rdx, %rax
-; AVX512DQ-NEXT:    movq %rbp, %rdx
-; AVX512DQ-NEXT:    shlq $37, %rdx
-; AVX512DQ-NEXT:    orq %rax, %rdx
-; AVX512DQ-NEXT:    movq %rbp, %rax
-; AVX512DQ-NEXT:    shlq $38, %rax
-; AVX512DQ-NEXT:    orq %rdx, %rax
-; AVX512DQ-NEXT:    movq %rbp, %rdx
-; AVX512DQ-NEXT:    shlq $39, %rdx
-; AVX512DQ-NEXT:    orq %rax, %rdx
-; AVX512DQ-NEXT:    shlq $40, %rbp
-; AVX512DQ-NEXT:    orq %rdx, %rbp
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $41, %rax
-; AVX512DQ-NEXT:    orq %rbp, %rax
-; AVX512DQ-NEXT:    movq %r12, %rdx
-; AVX512DQ-NEXT:    shlq $42, %rdx
-; AVX512DQ-NEXT:    orq %rax, %rdx
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $43, %rax
-; AVX512DQ-NEXT:    orq %rdx, %rax
-; AVX512DQ-NEXT:    movq %r12, %rdx
-; AVX512DQ-NEXT:    shlq $44, %rdx
-; AVX512DQ-NEXT:    orq %rax, %rdx
-; AVX512DQ-NEXT:    shlq $45, %r12
-; AVX512DQ-NEXT:    orq %rdx, %r12
-; AVX512DQ-NEXT:    andl $1, %r14d
-; AVX512DQ-NEXT:    movq %r14, %rax
-; AVX512DQ-NEXT:    shlq $46, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r14, %rdx
-; AVX512DQ-NEXT:    shlq $47, %rdx
-; AVX512DQ-NEXT:    orq %rax, %rdx
-; AVX512DQ-NEXT:    movq %r14, %rax
-; AVX512DQ-NEXT:    shlq $48, %rax
-; AVX512DQ-NEXT:    orq %rdx, %rax
-; AVX512DQ-NEXT:    movq %r14, %rdx
-; AVX512DQ-NEXT:    shlq $49, %rdx
-; AVX512DQ-NEXT:    orq %rax, %rdx
-; AVX512DQ-NEXT:    shlq $50, %r14
-; AVX512DQ-NEXT:    orq %rdx, %r14
-; AVX512DQ-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %rdx
-; AVX512DQ-NEXT:    shlq $51, %rdx
-; AVX512DQ-NEXT:    orq %r14, %rdx
-; AVX512DQ-NEXT:    movq %rax, %rdi
-; AVX512DQ-NEXT:    shlq $52, %rdi
-; AVX512DQ-NEXT:    orq %rdx, %rdi
-; AVX512DQ-NEXT:    movq %rax, %rdx
-; AVX512DQ-NEXT:    shlq $53, %rdx
-; AVX512DQ-NEXT:    orq %rdi, %rdx
-; AVX512DQ-NEXT:    movq %rax, %rdi
-; AVX512DQ-NEXT:    shlq $54, %rdi
-; AVX512DQ-NEXT:    orq %rdx, %rdi
-; AVX512DQ-NEXT:    shlq $55, %rax
-; AVX512DQ-NEXT:    orq %rdi, %rax
-; AVX512DQ-NEXT:    andl $1, %r10d
-; AVX512DQ-NEXT:    movq %r10, %rdx
-; AVX512DQ-NEXT:    shlq $56, %rdx
-; AVX512DQ-NEXT:    orq %rax, %rdx
-; AVX512DQ-NEXT:    movq %r10, %rax
-; AVX512DQ-NEXT:    shlq $57, %rax
-; AVX512DQ-NEXT:    orq %rdx, %rax
-; AVX512DQ-NEXT:    movq %r10, %rdx
-; AVX512DQ-NEXT:    shlq $58, %rdx
-; AVX512DQ-NEXT:    orq %rax, %rdx
-; AVX512DQ-NEXT:    movq %r10, %rax
-; AVX512DQ-NEXT:    shlq $59, %rax
-; AVX512DQ-NEXT:    orq %rdx, %rax
-; AVX512DQ-NEXT:    shlq $60, %r10
-; AVX512DQ-NEXT:    orq %rax, %r10
-; AVX512DQ-NEXT:    movq %r9, %rax
-; AVX512DQ-NEXT:    shlq $61, %rax
-; AVX512DQ-NEXT:    orq %r10, %rax
-; AVX512DQ-NEXT:    shlq $62, %r9
-; AVX512DQ-NEXT:    orq %rax, %r9
-; AVX512DQ-NEXT:    movzbl %cl, %eax
-; AVX512DQ-NEXT:    shlq $63, %rax
-; AVX512DQ-NEXT:    orq %r9, %rax
-; AVX512DQ-NEXT:    orq %rbx, %rax
-; AVX512DQ-NEXT:    movq %rax, 8(%rsi)
-; AVX512DQ-NEXT:    popq %rbx
-; AVX512DQ-NEXT:    popq %r12
-; AVX512DQ-NEXT:    popq %r13
-; AVX512DQ-NEXT:    popq %r14
-; AVX512DQ-NEXT:    popq %r15
-; AVX512DQ-NEXT:    popq %rbp
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm3
+; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k1
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm2, %zmm4
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm5, %zmm6
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm7, %zmm8
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm9, %zmm0
+; AVX512DQ-NEXT:    vpermd %zmm3, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpermd %zmm3, %zmm2, %zmm2
+; AVX512DQ-NEXT:    vpermd %zmm3, %zmm5, %zmm5
+; AVX512DQ-NEXT:    vpermd %zmm3, %zmm7, %zmm7
+; AVX512DQ-NEXT:    vpermd %zmm3, %zmm9, %zmm3
+; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm9 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm3, %k1
+; AVX512DQ-NEXT:    vmovdqa32 576(%rsi), %zmm3 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm7, %k1
+; AVX512DQ-NEXT:    vmovdqa32 512(%rsi), %zmm7 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm5, %k1
+; AVX512DQ-NEXT:    vmovdqa32 448(%rsi), %zmm5 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k1
+; AVX512DQ-NEXT:    vmovdqa32 384(%rsi), %zmm2 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k1
+; AVX512DQ-NEXT:    vmovdqa32 320(%rsi), %zmm1 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k1
+; AVX512DQ-NEXT:    vmovdqa32 256(%rsi), %zmm0 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm8, %k1
+; AVX512DQ-NEXT:    vmovdqa32 192(%rsi), %zmm8 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm6, %k1
+; AVX512DQ-NEXT:    vmovdqa32 128(%rsi), %zmm6 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm4, %k1
+; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm4 {%k1} {z}
+; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 64(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm6, 128(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm8, 192(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 256(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 320(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 384(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm5, 448(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm7, 512(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 576(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm9, (%rdx)
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512BW-LABEL: mask_replication_factor5_vf32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    pushq %rbp
-; AVX512BW-NEXT:    pushq %r15
-; AVX512BW-NEXT:    pushq %r14
-; AVX512BW-NEXT:    pushq %r13
-; AVX512BW-NEXT:    pushq %r12
-; AVX512BW-NEXT:    pushq %rbx
 ; AVX512BW-NEXT:    kmovd (%rdi), %k0
-; AVX512BW-NEXT:    kshiftrd $25, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %ecx
-; AVX512BW-NEXT:    kshiftrd $19, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512BW-NEXT:    kshiftrd $18, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %edx
-; AVX512BW-NEXT:    kshiftrd $17, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %edi
-; AVX512BW-NEXT:    kshiftrd $16, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r8d
-; AVX512BW-NEXT:    kshiftrd $15, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r10d
-; AVX512BW-NEXT:    kshiftrd $14, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r11d
-; AVX512BW-NEXT:    kshiftrd $13, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %ebx
-; AVX512BW-NEXT:    kshiftrd $26, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r14d
-; AVX512BW-NEXT:    kshiftrd $27, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r15d
-; AVX512BW-NEXT:    kshiftrd $28, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrd $29, %k0, %k1
-; AVX512BW-NEXT:    andl $1, %r14d
-; AVX512BW-NEXT:    movl %ecx, %r9d
-; AVX512BW-NEXT:    andl $1, %r9d
-; AVX512BW-NEXT:    leal (%r9,%r9,2), %r13d
-; AVX512BW-NEXT:    leal (%r13,%r14,4), %r13d
-; AVX512BW-NEXT:    leal (%r13,%r14,8), %ebp
-; AVX512BW-NEXT:    movl %r14d, %r13d
-; AVX512BW-NEXT:    shll $4, %r13d
-; AVX512BW-NEXT:    orl %ebp, %r13d
-; AVX512BW-NEXT:    movl %r14d, %ebp
-; AVX512BW-NEXT:    shll $5, %ebp
-; AVX512BW-NEXT:    orl %r13d, %ebp
-; AVX512BW-NEXT:    shll $6, %r14d
-; AVX512BW-NEXT:    andl $1, %r15d
-; AVX512BW-NEXT:    movl %r15d, %r13d
-; AVX512BW-NEXT:    shll $7, %r13d
-; AVX512BW-NEXT:    orl %r14d, %r13d
-; AVX512BW-NEXT:    movl %r15d, %r14d
-; AVX512BW-NEXT:    shll $8, %r14d
-; AVX512BW-NEXT:    orl %r13d, %r14d
-; AVX512BW-NEXT:    movl %r15d, %r13d
-; AVX512BW-NEXT:    shll $9, %r13d
-; AVX512BW-NEXT:    orl %r14d, %r13d
-; AVX512BW-NEXT:    movl %r15d, %eax
-; AVX512BW-NEXT:    shll $10, %eax
-; AVX512BW-NEXT:    orl %r13d, %eax
-; AVX512BW-NEXT:    kmovd %k1, %r14d
-; AVX512BW-NEXT:    kshiftrd $30, %k0, %k1
-; AVX512BW-NEXT:    shll $11, %r15d
-; AVX512BW-NEXT:    orl %eax, %r15d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movl %r12d, %eax
-; AVX512BW-NEXT:    shll $12, %eax
-; AVX512BW-NEXT:    orl %r15d, %eax
-; AVX512BW-NEXT:    movl %r12d, %r15d
-; AVX512BW-NEXT:    shll $13, %r15d
-; AVX512BW-NEXT:    orl %eax, %r15d
-; AVX512BW-NEXT:    movl %r12d, %eax
-; AVX512BW-NEXT:    shll $14, %eax
-; AVX512BW-NEXT:    orl %r15d, %eax
-; AVX512BW-NEXT:    movl %r12d, %r13d
-; AVX512BW-NEXT:    shll $15, %r13d
-; AVX512BW-NEXT:    orl %eax, %r13d
-; AVX512BW-NEXT:    kmovd %k1, %r15d
-; AVX512BW-NEXT:    kshiftrd $31, %k0, %k1
-; AVX512BW-NEXT:    shll $16, %r12d
-; AVX512BW-NEXT:    orl %r13d, %r12d
-; AVX512BW-NEXT:    andl $1, %r14d
-; AVX512BW-NEXT:    movl %r14d, %eax
-; AVX512BW-NEXT:    shll $17, %eax
-; AVX512BW-NEXT:    orl %r12d, %eax
-; AVX512BW-NEXT:    movl %r14d, %r12d
-; AVX512BW-NEXT:    shll $18, %r12d
-; AVX512BW-NEXT:    orl %eax, %r12d
-; AVX512BW-NEXT:    movl %r14d, %eax
-; AVX512BW-NEXT:    shll $19, %eax
-; AVX512BW-NEXT:    orl %r12d, %eax
-; AVX512BW-NEXT:    movl %r14d, %r12d
-; AVX512BW-NEXT:    shll $20, %r12d
-; AVX512BW-NEXT:    orl %eax, %r12d
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    kshiftrd $6, %k0, %k1
-; AVX512BW-NEXT:    shll $21, %r14d
-; AVX512BW-NEXT:    orl %r12d, %r14d
-; AVX512BW-NEXT:    andl $1, %r15d
-; AVX512BW-NEXT:    movl %r15d, %r12d
-; AVX512BW-NEXT:    shll $22, %r12d
-; AVX512BW-NEXT:    orl %r14d, %r12d
-; AVX512BW-NEXT:    movl %r15d, %r14d
-; AVX512BW-NEXT:    shll $23, %r14d
-; AVX512BW-NEXT:    orl %r12d, %r14d
-; AVX512BW-NEXT:    movl %r15d, %r12d
-; AVX512BW-NEXT:    shll $24, %r12d
-; AVX512BW-NEXT:    orl %r14d, %r12d
-; AVX512BW-NEXT:    movl %r15d, %r14d
-; AVX512BW-NEXT:    shll $25, %r14d
-; AVX512BW-NEXT:    orl %r12d, %r14d
-; AVX512BW-NEXT:    shll $26, %r15d
-; AVX512BW-NEXT:    orl %r14d, %r15d
-; AVX512BW-NEXT:    movl %eax, %r14d
-; AVX512BW-NEXT:    andl $1, %r14d
-; AVX512BW-NEXT:    movl %r14d, %r12d
-; AVX512BW-NEXT:    shll $27, %r12d
-; AVX512BW-NEXT:    orl %r15d, %r12d
-; AVX512BW-NEXT:    movl %r14d, %r15d
-; AVX512BW-NEXT:    shll $28, %r15d
-; AVX512BW-NEXT:    orl %r12d, %r15d
-; AVX512BW-NEXT:    movl %r14d, %r12d
-; AVX512BW-NEXT:    shll $29, %r12d
-; AVX512BW-NEXT:    orl %r15d, %r12d
-; AVX512BW-NEXT:    kmovd %k1, %r15d
 ; AVX512BW-NEXT:    kshiftrd $1, %k0, %k1
-; AVX512BW-NEXT:    shll $30, %r14d
-; AVX512BW-NEXT:    orl %r12d, %r14d
-; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    movw $-3, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k6
+; AVX512BW-NEXT:    kandw %k6, %k0, %k3
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k2
+; AVX512BW-NEXT:    kshiftrw $14, %k2, %k4
+; AVX512BW-NEXT:    korw %k4, %k3, %k3
+; AVX512BW-NEXT:    movw $-5, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k4
+; AVX512BW-NEXT:    kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k4, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $13, %k2, %k4
+; AVX512BW-NEXT:    korw %k4, %k3, %k3
+; AVX512BW-NEXT:    movw $-9, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k4
+; AVX512BW-NEXT:    kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k4, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $12, %k2, %k4
+; AVX512BW-NEXT:    korw %k4, %k3, %k3
+; AVX512BW-NEXT:    movw $-17, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k4
+; AVX512BW-NEXT:    kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k4, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $11, %k2, %k2
+; AVX512BW-NEXT:    korw %k2, %k3, %k2
+; AVX512BW-NEXT:    movw $-33, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k3
+; AVX512BW-NEXT:    kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k3, %k2, %k2
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $10, %k1, %k3
+; AVX512BW-NEXT:    korw %k3, %k2, %k2
+; AVX512BW-NEXT:    movw $-65, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k3
+; AVX512BW-NEXT:    kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k3, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $9, %k1, %k3
+; AVX512BW-NEXT:    korw %k3, %k2, %k2
+; AVX512BW-NEXT:    movw $-129, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k3
+; AVX512BW-NEXT:    kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k3, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $8, %k1, %k3
+; AVX512BW-NEXT:    korw %k3, %k2, %k2
+; AVX512BW-NEXT:    movw $-257, %ax # imm = 0xFEFF
+; AVX512BW-NEXT:    kmovd %eax, %k3
+; AVX512BW-NEXT:    kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k3, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $7, %k1, %k3
+; AVX512BW-NEXT:    korw %k3, %k2, %k2
+; AVX512BW-NEXT:    movw $-513, %ax # imm = 0xFDFF
+; AVX512BW-NEXT:    kmovd %eax, %k7
+; AVX512BW-NEXT:    kandw %k7, %k2, %k2
+; AVX512BW-NEXT:    kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kshiftrw $6, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k2, %k1
+; AVX512BW-NEXT:    movw $-1025, %ax # imm = 0xFBFF
+; AVX512BW-NEXT:    kmovd %eax, %k2
+; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k2, %k1, %k3
 ; AVX512BW-NEXT:    kshiftrd $2, %k0, %k1
-; AVX512BW-NEXT:    shll $31, %eax
-; AVX512BW-NEXT:    orl %r14d, %eax
-; AVX512BW-NEXT:    orl %ebp, %eax
-; AVX512BW-NEXT:    movl %eax, 16(%rsi)
-; AVX512BW-NEXT:    kmovd %k0, %eax
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    leaq (%rax,%rax,2), %r14
-; AVX512BW-NEXT:    leaq (%r14,%rax,4), %r14
-; AVX512BW-NEXT:    leaq (%r14,%rax,8), %r14
-; AVX512BW-NEXT:    shlq $4, %rax
-; AVX512BW-NEXT:    orq %r14, %rax
-; AVX512BW-NEXT:    movzbl %r12b, %r13d
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movq %r13, %r14
-; AVX512BW-NEXT:    shlq $5, %r14
-; AVX512BW-NEXT:    orq %rax, %r14
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $6, %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $7, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $8, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    kmovd %k1, %ebp
-; AVX512BW-NEXT:    kshiftrd $3, %k0, %k1
-; AVX512BW-NEXT:    shlq $9, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movzbl %bpl, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $10, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $11, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $12, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $13, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    kshiftrd $4, %k0, %k1
-; AVX512BW-NEXT:    shlq $14, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    movzbl %al, %r13d
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $15, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $16, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $17, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $18, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    kshiftrd $5, %k0, %k1
-; AVX512BW-NEXT:    shlq $19, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    movzbl %al, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $20, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $21, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $22, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $23, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    kshiftrd $12, %k0, %k1
-; AVX512BW-NEXT:    shlq $24, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    movzbl %al, %r13d
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $25, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $26, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $27, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $28, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    shlq $29, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    movzbl %r15b, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %r12
-; AVX512BW-NEXT:    shlq $30, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    kmovd %k1, %ebp
-; AVX512BW-NEXT:    kshiftrd $7, %k0, %k1
-; AVX512BW-NEXT:    shlq $31, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    andl $1, %r15d
-; AVX512BW-NEXT:    movq %r15, %r12
-; AVX512BW-NEXT:    shlq $32, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $33, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrd $8, %k0, %k1
-; AVX512BW-NEXT:    shlq $34, %r15
-; AVX512BW-NEXT:    orq %rax, %r15
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $35, %rax
-; AVX512BW-NEXT:    orq %r15, %rax
-; AVX512BW-NEXT:    movq %r12, %r15
-; AVX512BW-NEXT:    shlq $36, %r15
-; AVX512BW-NEXT:    orq %rax, %r15
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $37, %rax
-; AVX512BW-NEXT:    orq %r15, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $38, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    kmovd %k1, %r15d
-; AVX512BW-NEXT:    kshiftrd $9, %k0, %k1
-; AVX512BW-NEXT:    shlq $39, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    andl $1, %r15d
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $40, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r15, %r12
-; AVX512BW-NEXT:    shlq $41, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $42, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r15, %r13
-; AVX512BW-NEXT:    shlq $43, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrd $10, %k0, %k1
-; AVX512BW-NEXT:    shlq $44, %r15
-; AVX512BW-NEXT:    orq %r13, %r15
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $45, %rax
-; AVX512BW-NEXT:    orq %r15, %rax
-; AVX512BW-NEXT:    movq %r12, %r15
-; AVX512BW-NEXT:    shlq $46, %r15
-; AVX512BW-NEXT:    orq %rax, %r15
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $47, %rax
-; AVX512BW-NEXT:    orq %r15, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $48, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    kmovd %k1, %r15d
-; AVX512BW-NEXT:    kshiftrd $11, %k0, %k1
-; AVX512BW-NEXT:    shlq $49, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    andl $1, %r15d
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $50, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r15, %r12
-; AVX512BW-NEXT:    shlq $51, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $52, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r15, %r12
-; AVX512BW-NEXT:    shlq $53, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    kshiftrd $20, %k0, %k1
-; AVX512BW-NEXT:    shlq $54, %r15
-; AVX512BW-NEXT:    orq %r12, %r15
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %r12
-; AVX512BW-NEXT:    shlq $55, %r12
-; AVX512BW-NEXT:    orq %r15, %r12
-; AVX512BW-NEXT:    movq %rax, %r15
-; AVX512BW-NEXT:    shlq $56, %r15
-; AVX512BW-NEXT:    orq %r12, %r15
-; AVX512BW-NEXT:    movq %rax, %r12
-; AVX512BW-NEXT:    shlq $57, %r12
-; AVX512BW-NEXT:    orq %r15, %r12
-; AVX512BW-NEXT:    movq %rax, %r15
-; AVX512BW-NEXT:    shlq $58, %r15
-; AVX512BW-NEXT:    orq %r12, %r15
-; AVX512BW-NEXT:    shlq $59, %rax
-; AVX512BW-NEXT:    orq %r15, %rax
-; AVX512BW-NEXT:    movl %ebp, %r15d
-; AVX512BW-NEXT:    andl $1, %r15d
-; AVX512BW-NEXT:    movq %r15, %r13
-; AVX512BW-NEXT:    shlq $60, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r15, %r12
-; AVX512BW-NEXT:    shlq $61, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    kmovd %k1, %r13d
-; AVX512BW-NEXT:    kshiftrd $21, %k0, %k1
-; AVX512BW-NEXT:    movzbl %bpl, %ebp
-; AVX512BW-NEXT:    shlq $62, %r15
-; AVX512BW-NEXT:    orq %r12, %r15
-; AVX512BW-NEXT:    movq %rbp, %r12
-; AVX512BW-NEXT:    shlq $63, %r12
-; AVX512BW-NEXT:    orq %r15, %r12
-; AVX512BW-NEXT:    kmovd %k1, %r15d
-; AVX512BW-NEXT:    kshiftrd $22, %k0, %k1
-; AVX512BW-NEXT:    orq %r14, %r12
-; AVX512BW-NEXT:    kmovd %k1, %r14d
-; AVX512BW-NEXT:    kshiftrd $23, %k0, %k1
-; AVX512BW-NEXT:    movq %r12, (%rsi)
-; AVX512BW-NEXT:    movzbl %bl, %ebx
-; AVX512BW-NEXT:    andl $1, %ebx
-; AVX512BW-NEXT:    andl $1, %ebp
-; AVX512BW-NEXT:    leaq (%rbp,%rbx,2), %rax
-; AVX512BW-NEXT:    leaq (%rax,%rbx,4), %rax
-; AVX512BW-NEXT:    leaq (%rax,%rbx,8), %rax
-; AVX512BW-NEXT:    movq %rbx, %r12
-; AVX512BW-NEXT:    shlq $4, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    shlq $5, %rbx
-; AVX512BW-NEXT:    orq %r12, %rbx
-; AVX512BW-NEXT:    movzbl %r11b, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %r11
-; AVX512BW-NEXT:    shlq $6, %r11
-; AVX512BW-NEXT:    movq %rax, %r12
-; AVX512BW-NEXT:    shlq $7, %r12
-; AVX512BW-NEXT:    orq %r11, %r12
-; AVX512BW-NEXT:    movq %rax, %r11
-; AVX512BW-NEXT:    shlq $8, %r11
-; AVX512BW-NEXT:    orq %r12, %r11
-; AVX512BW-NEXT:    movq %rax, %r12
-; AVX512BW-NEXT:    shlq $9, %r12
-; AVX512BW-NEXT:    orq %r11, %r12
-; AVX512BW-NEXT:    kmovd %k1, %r11d
-; AVX512BW-NEXT:    kshiftrd $24, %k0, %k0
-; AVX512BW-NEXT:    shlq $10, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movzbl %r10b, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %r10
-; AVX512BW-NEXT:    shlq $11, %r10
-; AVX512BW-NEXT:    orq %rax, %r10
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $12, %rax
-; AVX512BW-NEXT:    orq %r10, %rax
-; AVX512BW-NEXT:    movq %r12, %r10
-; AVX512BW-NEXT:    shlq $13, %r10
-; AVX512BW-NEXT:    orq %rax, %r10
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $14, %rax
-; AVX512BW-NEXT:    orq %r10, %rax
-; AVX512BW-NEXT:    kmovd %k0, %r10d
-; AVX512BW-NEXT:    shlq $15, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movzbl %r8b, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %r8
-; AVX512BW-NEXT:    shlq $16, %r8
-; AVX512BW-NEXT:    orq %r12, %r8
-; AVX512BW-NEXT:    movq %rax, %r12
-; AVX512BW-NEXT:    shlq $17, %r12
-; AVX512BW-NEXT:    orq %r8, %r12
-; AVX512BW-NEXT:    movq %rax, %r8
-; AVX512BW-NEXT:    shlq $18, %r8
-; AVX512BW-NEXT:    orq %r12, %r8
-; AVX512BW-NEXT:    movq %rax, %r12
-; AVX512BW-NEXT:    shlq $19, %r12
-; AVX512BW-NEXT:    orq %r8, %r12
-; AVX512BW-NEXT:    shlq $20, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movzbl %dil, %edi
-; AVX512BW-NEXT:    andl $1, %edi
-; AVX512BW-NEXT:    movq %rdi, %r8
-; AVX512BW-NEXT:    shlq $21, %r8
-; AVX512BW-NEXT:    orq %rax, %r8
-; AVX512BW-NEXT:    movq %rdi, %rax
-; AVX512BW-NEXT:    shlq $22, %rax
-; AVX512BW-NEXT:    orq %r8, %rax
-; AVX512BW-NEXT:    movq %rdi, %r8
-; AVX512BW-NEXT:    shlq $23, %r8
-; AVX512BW-NEXT:    orq %rax, %r8
-; AVX512BW-NEXT:    movq %rdi, %rax
-; AVX512BW-NEXT:    shlq $24, %rax
-; AVX512BW-NEXT:    orq %r8, %rax
-; AVX512BW-NEXT:    shlq $25, %rdi
-; AVX512BW-NEXT:    orq %rax, %rdi
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %rdx
-; AVX512BW-NEXT:    shlq $26, %rdx
-; AVX512BW-NEXT:    orq %rdi, %rdx
-; AVX512BW-NEXT:    movq %rax, %rdi
-; AVX512BW-NEXT:    shlq $27, %rdi
-; AVX512BW-NEXT:    orq %rdx, %rdi
-; AVX512BW-NEXT:    movq %rax, %rdx
-; AVX512BW-NEXT:    shlq $28, %rdx
-; AVX512BW-NEXT:    orq %rdi, %rdx
-; AVX512BW-NEXT:    movq %rax, %rdi
-; AVX512BW-NEXT:    shlq $29, %rdi
-; AVX512BW-NEXT:    orq %rdx, %rdi
-; AVX512BW-NEXT:    shlq $30, %rax
-; AVX512BW-NEXT:    orq %rdi, %rax
-; AVX512BW-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
-; AVX512BW-NEXT:    andl $1, %edx
-; AVX512BW-NEXT:    movq %rdx, %rdi
-; AVX512BW-NEXT:    shlq $31, %rdi
-; AVX512BW-NEXT:    orq %rax, %rdi
-; AVX512BW-NEXT:    movq %rdx, %rax
-; AVX512BW-NEXT:    shlq $32, %rax
-; AVX512BW-NEXT:    orq %rdi, %rax
-; AVX512BW-NEXT:    movq %rdx, %rdi
-; AVX512BW-NEXT:    shlq $33, %rdi
-; AVX512BW-NEXT:    orq %rax, %rdi
-; AVX512BW-NEXT:    movq %rdx, %rax
-; AVX512BW-NEXT:    shlq $34, %rax
-; AVX512BW-NEXT:    orq %rdi, %rax
-; AVX512BW-NEXT:    shlq $35, %rdx
-; AVX512BW-NEXT:    orq %rax, %rdx
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $36, %rax
-; AVX512BW-NEXT:    orq %rdx, %rax
-; AVX512BW-NEXT:    movq %r13, %rdx
-; AVX512BW-NEXT:    shlq $37, %rdx
-; AVX512BW-NEXT:    orq %rax, %rdx
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $38, %rax
-; AVX512BW-NEXT:    orq %rdx, %rax
-; AVX512BW-NEXT:    movq %r13, %rdx
-; AVX512BW-NEXT:    shlq $39, %rdx
-; AVX512BW-NEXT:    orq %rax, %rdx
-; AVX512BW-NEXT:    shlq $40, %r13
-; AVX512BW-NEXT:    orq %rdx, %r13
-; AVX512BW-NEXT:    andl $1, %r15d
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $41, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r15, %rdx
-; AVX512BW-NEXT:    shlq $42, %rdx
-; AVX512BW-NEXT:    orq %rax, %rdx
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $43, %rax
-; AVX512BW-NEXT:    orq %rdx, %rax
-; AVX512BW-NEXT:    movq %r15, %rdx
-; AVX512BW-NEXT:    shlq $44, %rdx
-; AVX512BW-NEXT:    orq %rax, %rdx
-; AVX512BW-NEXT:    shlq $45, %r15
-; AVX512BW-NEXT:    orq %rdx, %r15
-; AVX512BW-NEXT:    andl $1, %r14d
-; AVX512BW-NEXT:    movq %r14, %rax
-; AVX512BW-NEXT:    shlq $46, %rax
-; AVX512BW-NEXT:    orq %r15, %rax
-; AVX512BW-NEXT:    movq %r14, %rdx
-; AVX512BW-NEXT:    shlq $47, %rdx
-; AVX512BW-NEXT:    orq %rax, %rdx
-; AVX512BW-NEXT:    movq %r14, %rax
-; AVX512BW-NEXT:    shlq $48, %rax
-; AVX512BW-NEXT:    orq %rdx, %rax
-; AVX512BW-NEXT:    movq %r14, %rdx
-; AVX512BW-NEXT:    shlq $49, %rdx
-; AVX512BW-NEXT:    orq %rax, %rdx
-; AVX512BW-NEXT:    shlq $50, %r14
-; AVX512BW-NEXT:    orq %rdx, %r14
-; AVX512BW-NEXT:    andl $1, %r11d
-; AVX512BW-NEXT:    movq %r11, %rax
-; AVX512BW-NEXT:    shlq $51, %rax
-; AVX512BW-NEXT:    orq %r14, %rax
-; AVX512BW-NEXT:    movq %r11, %rdx
-; AVX512BW-NEXT:    shlq $52, %rdx
-; AVX512BW-NEXT:    orq %rax, %rdx
-; AVX512BW-NEXT:    movq %r11, %rax
-; AVX512BW-NEXT:    shlq $53, %rax
-; AVX512BW-NEXT:    orq %rdx, %rax
-; AVX512BW-NEXT:    movq %r11, %rdx
-; AVX512BW-NEXT:    shlq $54, %rdx
-; AVX512BW-NEXT:    orq %rax, %rdx
-; AVX512BW-NEXT:    shlq $55, %r11
-; AVX512BW-NEXT:    orq %rdx, %r11
-; AVX512BW-NEXT:    andl $1, %r10d
-; AVX512BW-NEXT:    movq %r10, %rax
-; AVX512BW-NEXT:    shlq $56, %rax
-; AVX512BW-NEXT:    orq %r11, %rax
-; AVX512BW-NEXT:    movq %r10, %rdx
-; AVX512BW-NEXT:    shlq $57, %rdx
-; AVX512BW-NEXT:    orq %rax, %rdx
-; AVX512BW-NEXT:    movq %r10, %rax
-; AVX512BW-NEXT:    shlq $58, %rax
-; AVX512BW-NEXT:    orq %rdx, %rax
-; AVX512BW-NEXT:    movq %r10, %rdx
-; AVX512BW-NEXT:    shlq $59, %rdx
-; AVX512BW-NEXT:    orq %rax, %rdx
-; AVX512BW-NEXT:    shlq $60, %r10
-; AVX512BW-NEXT:    orq %rdx, %r10
-; AVX512BW-NEXT:    movq %r9, %rax
-; AVX512BW-NEXT:    shlq $61, %rax
-; AVX512BW-NEXT:    orq %r10, %rax
-; AVX512BW-NEXT:    shlq $62, %r9
-; AVX512BW-NEXT:    orq %rax, %r9
-; AVX512BW-NEXT:    movzbl %cl, %eax
-; AVX512BW-NEXT:    shlq $63, %rax
-; AVX512BW-NEXT:    orq %r9, %rax
-; AVX512BW-NEXT:    orq %rbx, %rax
-; AVX512BW-NEXT:    movq %rax, 8(%rsi)
-; AVX512BW-NEXT:    popq %rbx
-; AVX512BW-NEXT:    popq %r12
-; AVX512BW-NEXT:    popq %r13
-; AVX512BW-NEXT:    popq %r14
-; AVX512BW-NEXT:    popq %r15
-; AVX512BW-NEXT:    popq %rbp
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k2
+; AVX512BW-NEXT:    kshiftrw $5, %k2, %k4
+; AVX512BW-NEXT:    korw %k4, %k3, %k3
+; AVX512BW-NEXT:    movw $-2049, %ax # imm = 0xF7FF
+; AVX512BW-NEXT:    kmovd %eax, %k4
+; AVX512BW-NEXT:    kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k4, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $4, %k2, %k4
+; AVX512BW-NEXT:    korw %k4, %k3, %k3
+; AVX512BW-NEXT:    movw $-4097, %ax # imm = 0xEFFF
+; AVX512BW-NEXT:    kmovd %eax, %k4
+; AVX512BW-NEXT:    kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k4, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $3, %k2, %k4
+; AVX512BW-NEXT:    korw %k4, %k3, %k3
+; AVX512BW-NEXT:    movw $-8193, %ax # imm = 0xDFFF
+; AVX512BW-NEXT:    kmovd %eax, %k4
+; AVX512BW-NEXT:    kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k4, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $2, %k2, %k2
+; AVX512BW-NEXT:    korw %k2, %k3, %k2
+; AVX512BW-NEXT:    movw $-16385, %ax # imm = 0xBFFF
+; AVX512BW-NEXT:    kmovd %eax, %k3
+; AVX512BW-NEXT:    kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k3, %k2, %k2
+; AVX512BW-NEXT:    kshiftlw $14, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k2, %k1
+; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
+; AVX512BW-NEXT:    kshiftrd $3, %k0, %k2
+; AVX512BW-NEXT:    kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
+; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    korw %k2, %k1, %k1
+; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    kshiftrd $29, %k0, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k2
+; AVX512BW-NEXT:    kshiftrd $28, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512BW-NEXT:    kandw %k6, %k1, %k3
+; AVX512BW-NEXT:    kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kshiftrw $14, %k2, %k4
+; AVX512BW-NEXT:    korw %k4, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $13, %k2, %k4
+; AVX512BW-NEXT:    korw %k4, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $12, %k2, %k4
+; AVX512BW-NEXT:    korw %k4, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $11, %k2, %k4
+; AVX512BW-NEXT:    korw %k4, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $10, %k2, %k2
+; AVX512BW-NEXT:    korw %k2, %k3, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k2, %k2
+; AVX512BW-NEXT:    kshiftrd $30, %k0, %k3
+; AVX512BW-NEXT:    kshiftlw $15, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $9, %k3, %k4
+; AVX512BW-NEXT:    korw %k4, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $8, %k3, %k4
+; AVX512BW-NEXT:    korw %k4, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $7, %k3, %k4
+; AVX512BW-NEXT:    korw %k4, %k2, %k2
+; AVX512BW-NEXT:    kandw %k7, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $6, %k3, %k4
+; AVX512BW-NEXT:    korw %k4, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $5, %k3, %k3
+; AVX512BW-NEXT:    korw %k3, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k2, %k2
+; AVX512BW-NEXT:    kshiftrd $31, %k0, %k3
+; AVX512BW-NEXT:    kshiftlw $15, %k3, %k4
+; AVX512BW-NEXT:    kshiftrw $4, %k4, %k7
+; AVX512BW-NEXT:    korw %k7, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $3, %k4, %k7
+; AVX512BW-NEXT:    korw %k7, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $2, %k4, %k7
+; AVX512BW-NEXT:    korw %k7, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k2, %k2
+; AVX512BW-NEXT:    kshiftlw $14, %k3, %k3
+; AVX512BW-NEXT:    korw %k3, %k2, %k2
+; AVX512BW-NEXT:    kshiftlw $1, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $1, %k2, %k2
+; AVX512BW-NEXT:    korw %k4, %k2, %k2
+; AVX512BW-NEXT:    vmovdqa32 576(%rsi), %zmm1 {%k2} {z}
+; AVX512BW-NEXT:    kshiftrd $25, %k0, %k2
+; AVX512BW-NEXT:    kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512BW-NEXT:    kandw %k6, %k2, %k3
+; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
+; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kshiftrw $14, %k2, %k7
+; AVX512BW-NEXT:    korw %k7, %k3, %k3
+; AVX512BW-NEXT:    kandw %k5, %k3, %k3
+; AVX512BW-NEXT:    kshiftrd $26, %k0, %k7
+; AVX512BW-NEXT:    kmovq %k0, %k4
+; AVX512BW-NEXT:    kmovd %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512BW-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512BW-NEXT:    kshiftrw $13, %k7, %k6
+; AVX512BW-NEXT:    korw %k6, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $12, %k7, %k6
+; AVX512BW-NEXT:    korw %k6, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $11, %k7, %k6
+; AVX512BW-NEXT:    korw %k6, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $10, %k7, %k6
+; AVX512BW-NEXT:    korw %k6, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $9, %k7, %k6
+; AVX512BW-NEXT:    korw %k6, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k3, %k3
+; AVX512BW-NEXT:    kshiftrd $27, %k4, %k6
+; AVX512BW-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $8, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $7, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $6, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $5, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k3, %k3
+; AVX512BW-NEXT:    kandw %k1, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $4, %k6, %k6
+; AVX512BW-NEXT:    korw %k6, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k3, %k3
+; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k6
+; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $2, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k3, %k3
+; AVX512BW-NEXT:    kshiftlw $14, %k0, %k1
+; AVX512BW-NEXT:    korw %k1, %k3, %k1
+; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    vmovdqa32 512(%rsi), %zmm2 {%k1} {z}
+; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload
+; AVX512BW-NEXT:    kshiftrd $22, %k0, %k3
+; AVX512BW-NEXT:    kmovd %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k3, %k6
+; AVX512BW-NEXT:    kshiftlw $15, %k3, %k3
+; AVX512BW-NEXT:    kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kshiftrw $14, %k3, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $13, %k3, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kandw %k5, %k6, %k6
+; AVX512BW-NEXT:    kshiftrd $23, %k0, %k7
+; AVX512BW-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512BW-NEXT:    kshiftrw $12, %k7, %k5
+; AVX512BW-NEXT:    korw %k5, %k6, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $11, %k7, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $10, %k7, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $9, %k7, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $8, %k7, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kandw %k2, %k5, %k5
+; AVX512BW-NEXT:    kshiftrd $24, %k0, %k6
+; AVX512BW-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $7, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $6, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $5, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $4, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $3, %k6, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kshiftrw $2, %k7, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kandw %k4, %k5, %k5
+; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
+; AVX512BW-NEXT:    kshiftlw $14, %k2, %k2
+; AVX512BW-NEXT:    korw %k2, %k5, %k2
+; AVX512BW-NEXT:    kshiftlw $1, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $1, %k2, %k2
+; AVX512BW-NEXT:    korw %k7, %k2, %k2
+; AVX512BW-NEXT:    vmovdqa32 448(%rsi), %zmm3 {%k2} {z}
+; AVX512BW-NEXT:    kshiftrd $19, %k0, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k2, %k4
+; AVX512BW-NEXT:    kshiftlw $15, %k2, %k6
+; AVX512BW-NEXT:    kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kshiftrw $14, %k6, %k5
+; AVX512BW-NEXT:    korw %k5, %k4, %k4
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k4, %k4
+; AVX512BW-NEXT:    kshiftrw $13, %k6, %k5
+; AVX512BW-NEXT:    korw %k5, %k4, %k4
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k4, %k4
+; AVX512BW-NEXT:    kshiftrw $12, %k6, %k5
+; AVX512BW-NEXT:    korw %k5, %k4, %k4
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k4, %k4
+; AVX512BW-NEXT:    kshiftrd $20, %k0, %k5
+; AVX512BW-NEXT:    kshiftlw $15, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $11, %k5, %k6
+; AVX512BW-NEXT:    korw %k6, %k4, %k4
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k4, %k4
+; AVX512BW-NEXT:    kshiftrw $10, %k5, %k6
+; AVX512BW-NEXT:    korw %k6, %k4, %k4
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k4, %k4
+; AVX512BW-NEXT:    kshiftrw $9, %k5, %k6
+; AVX512BW-NEXT:    korw %k6, %k4, %k4
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k4, %k4
+; AVX512BW-NEXT:    kshiftrw $8, %k5, %k6
+; AVX512BW-NEXT:    korw %k6, %k4, %k4
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k4, %k4
+; AVX512BW-NEXT:    kshiftrw $7, %k5, %k5
+; AVX512BW-NEXT:    korw %k5, %k4, %k4
+; AVX512BW-NEXT:    kandw %k1, %k4, %k4
+; AVX512BW-NEXT:    kshiftrd $21, %k0, %k5
+; AVX512BW-NEXT:    kshiftlw $15, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $6, %k5, %k6
+; AVX512BW-NEXT:    korw %k6, %k4, %k4
+; AVX512BW-NEXT:    kandw %k3, %k4, %k4
+; AVX512BW-NEXT:    kshiftrw $5, %k5, %k6
+; AVX512BW-NEXT:    korw %k6, %k4, %k4
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k4, %k4
+; AVX512BW-NEXT:    kshiftrw $4, %k5, %k6
+; AVX512BW-NEXT:    korw %k6, %k4, %k4
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k4, %k4
+; AVX512BW-NEXT:    kshiftrw $3, %k5, %k6
+; AVX512BW-NEXT:    korw %k6, %k4, %k4
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k4, %k4
+; AVX512BW-NEXT:    kshiftrw $2, %k5, %k5
+; AVX512BW-NEXT:    korw %k5, %k4, %k4
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k4, %k4
+; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
+; AVX512BW-NEXT:    kshiftlw $14, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k4, %k1
+; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    korw %k3, %k1, %k1
+; AVX512BW-NEXT:    vmovdqa32 384(%rsi), %zmm4 {%k1} {z}
+; AVX512BW-NEXT:    kshiftrd $16, %k0, %k1
+; AVX512BW-NEXT:    kandw %k7, %k1, %k3
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $14, %k1, %k4
+; AVX512BW-NEXT:    korw %k4, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $13, %k1, %k4
+; AVX512BW-NEXT:    korw %k4, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $12, %k1, %k4
+; AVX512BW-NEXT:    korw %k4, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $11, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k3, %k1
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrd $17, %k0, %k3
+; AVX512BW-NEXT:    kshiftlw $15, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $10, %k3, %k4
+; AVX512BW-NEXT:    korw %k4, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $9, %k3, %k4
+; AVX512BW-NEXT:    korw %k4, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $8, %k3, %k4
+; AVX512BW-NEXT:    korw %k4, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $7, %k3, %k4
+; AVX512BW-NEXT:    korw %k4, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $6, %k3, %k3
+; AVX512BW-NEXT:    korw %k3, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrd $18, %k0, %k3
+; AVX512BW-NEXT:    kshiftlw $15, %k3, %k4
+; AVX512BW-NEXT:    kshiftrw $5, %k4, %k5
+; AVX512BW-NEXT:    korw %k5, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $4, %k4, %k5
+; AVX512BW-NEXT:    korw %k5, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $3, %k4, %k5
+; AVX512BW-NEXT:    korw %k5, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $2, %k4, %k4
+; AVX512BW-NEXT:    korw %k4, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $14, %k3, %k3
+; AVX512BW-NEXT:    korw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    korw %k2, %k1, %k1
+; AVX512BW-NEXT:    vmovdqa32 320(%rsi), %zmm5 {%k1} {z}
+; AVX512BW-NEXT:    kshiftrd $13, %k0, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrd $12, %k0, %k3
+; AVX512BW-NEXT:    kmovd %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k3, %k2
+; AVX512BW-NEXT:    kshiftrw $14, %k1, %k4
+; AVX512BW-NEXT:    korw %k4, %k2, %k2
+; AVX512BW-NEXT:    kandw %k6, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $13, %k1, %k4
+; AVX512BW-NEXT:    korw %k4, %k2, %k2
+; AVX512BW-NEXT:    kandw %k7, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $12, %k1, %k4
+; AVX512BW-NEXT:    korw %k4, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $11, %k1, %k4
+; AVX512BW-NEXT:    korw %k4, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $10, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k2, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrd $14, %k0, %k2
+; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $9, %k2, %k4
+; AVX512BW-NEXT:    korw %k4, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $8, %k2, %k4
+; AVX512BW-NEXT:    korw %k4, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $7, %k2, %k4
+; AVX512BW-NEXT:    korw %k4, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $6, %k2, %k4
+; AVX512BW-NEXT:    korw %k4, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $5, %k2, %k2
+; AVX512BW-NEXT:    korw %k2, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrd $15, %k0, %k2
+; AVX512BW-NEXT:    kshiftlw $15, %k2, %k4
+; AVX512BW-NEXT:    kshiftrw $4, %k4, %k5
+; AVX512BW-NEXT:    korw %k5, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $3, %k4, %k5
+; AVX512BW-NEXT:    korw %k5, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $2, %k4, %k5
+; AVX512BW-NEXT:    korw %k5, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $14, %k2, %k2
+; AVX512BW-NEXT:    korw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
+; AVX512BW-NEXT:    korw %k4, %k1, %k1
+; AVX512BW-NEXT:    vmovdqa32 256(%rsi), %zmm6 {%k1} {z}
+; AVX512BW-NEXT:    kshiftrd $9, %k0, %k2
+; AVX512BW-NEXT:    kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k2, %k4
+; AVX512BW-NEXT:    kshiftlw $15, %k2, %k1
+; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kshiftrw $14, %k1, %k5
+; AVX512BW-NEXT:    korw %k5, %k4, %k4
+; AVX512BW-NEXT:    kandw %k6, %k4, %k4
+; AVX512BW-NEXT:    kshiftrd $10, %k0, %k5
+; AVX512BW-NEXT:    kshiftlw $15, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $13, %k5, %k6
+; AVX512BW-NEXT:    korw %k6, %k4, %k4
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k4, %k4
+; AVX512BW-NEXT:    kshiftrw $12, %k5, %k6
+; AVX512BW-NEXT:    korw %k6, %k4, %k4
+; AVX512BW-NEXT:    kandw %k7, %k4, %k4
+; AVX512BW-NEXT:    kshiftrw $11, %k5, %k6
+; AVX512BW-NEXT:    korw %k6, %k4, %k4
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k4, %k4
+; AVX512BW-NEXT:    kshiftrw $10, %k5, %k6
+; AVX512BW-NEXT:    korw %k6, %k4, %k4
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k4, %k4
+; AVX512BW-NEXT:    kshiftrw $9, %k5, %k5
+; AVX512BW-NEXT:    korw %k5, %k4, %k4
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k4, %k4
+; AVX512BW-NEXT:    kshiftrd $11, %k0, %k5
+; AVX512BW-NEXT:    kshiftlw $15, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $8, %k5, %k6
+; AVX512BW-NEXT:    korw %k6, %k4, %k4
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k4, %k4
+; AVX512BW-NEXT:    kshiftrw $7, %k5, %k6
+; AVX512BW-NEXT:    korw %k6, %k4, %k4
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k4, %k4
+; AVX512BW-NEXT:    kshiftrw $6, %k5, %k6
+; AVX512BW-NEXT:    korw %k6, %k4, %k4
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k4, %k4
+; AVX512BW-NEXT:    kshiftrw $5, %k5, %k6
+; AVX512BW-NEXT:    korw %k6, %k4, %k4
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k4, %k4
+; AVX512BW-NEXT:    kshiftrw $4, %k5, %k5
+; AVX512BW-NEXT:    korw %k5, %k4, %k4
+; AVX512BW-NEXT:    kandw %k3, %k4, %k4
+; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 4-byte Reload
+; AVX512BW-NEXT:    kshiftlw $15, %k7, %k5
+; AVX512BW-NEXT:    kshiftrw $3, %k5, %k6
+; AVX512BW-NEXT:    korw %k6, %k4, %k4
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k4, %k4
+; AVX512BW-NEXT:    kshiftrw $2, %k5, %k6
+; AVX512BW-NEXT:    korw %k6, %k4, %k4
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k4, %k4
+; AVX512BW-NEXT:    kshiftlw $14, %k7, %k3
+; AVX512BW-NEXT:    korw %k3, %k4, %k3
+; AVX512BW-NEXT:    kshiftlw $1, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $1, %k3, %k3
+; AVX512BW-NEXT:    korw %k5, %k3, %k3
+; AVX512BW-NEXT:    vmovdqa32 192(%rsi), %zmm7 {%k3} {z}
+; AVX512BW-NEXT:    kshiftrd $6, %k0, %k4
+; AVX512BW-NEXT:    kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k4, %k5
+; AVX512BW-NEXT:    kshiftlw $15, %k4, %k4
+; AVX512BW-NEXT:    kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kshiftrw $14, %k4, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $13, %k4, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k5, %k5
+; AVX512BW-NEXT:    kshiftrd $7, %k0, %k6
+; AVX512BW-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $12, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $11, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $10, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $9, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $8, %k6, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k5, %k5
+; AVX512BW-NEXT:    kshiftrd $8, %k0, %k6
+; AVX512BW-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $7, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $6, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $5, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k5, %k5
+; AVX512BW-NEXT:    kandw %k2, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $4, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $3, %k6, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kandw %k1, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kshiftrw $2, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k5, %k5
+; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
+; AVX512BW-NEXT:    kshiftlw $14, %k2, %k2
+; AVX512BW-NEXT:    korw %k2, %k5, %k2
+; AVX512BW-NEXT:    kshiftlw $1, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $1, %k2, %k2
+; AVX512BW-NEXT:    korw %k1, %k2, %k1
+; AVX512BW-NEXT:    vmovdqa32 128(%rsi), %zmm8 {%k1} {z}
+; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kshiftrw $14, %k5, %k2
+; AVX512BW-NEXT:    korw %k2, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $13, %k5, %k2
+; AVX512BW-NEXT:    korw %k2, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $12, %k5, %k2
+; AVX512BW-NEXT:    korw %k2, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrd $4, %k0, %k2
+; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $11, %k2, %k5
+; AVX512BW-NEXT:    korw %k5, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $10, %k2, %k5
+; AVX512BW-NEXT:    korw %k5, %k1, %k1
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $9, %k2, %k5
+; AVX512BW-NEXT:    korw %k5, %k1, %k1
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $8, %k2, %k5
+; AVX512BW-NEXT:    korw %k5, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $7, %k2, %k2
+; AVX512BW-NEXT:    korw %k2, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrd $5, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $6, %k0, %k2
+; AVX512BW-NEXT:    korw %k2, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $5, %k0, %k2
+; AVX512BW-NEXT:    korw %k2, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $4, %k0, %k2
+; AVX512BW-NEXT:    korw %k2, %k1, %k1
+; AVX512BW-NEXT:    kandw %k7, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $3, %k0, %k2
+; AVX512BW-NEXT:    korw %k2, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $2, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
+; AVX512BW-NEXT:    kshiftlw $14, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    korw %k1, %k0, %k1
+; AVX512BW-NEXT:    vmovdqa32 64(%rsi), %zmm9 {%k1} {z}
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, 64(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, 128(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, 192(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, 256(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, 320(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, 384(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, 448(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, 512(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, 576(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdx)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
-  %src.vec = load <32 x i1>, ptr %in.vec, align 64
-  %tgt.mask = shufflevector <32 x i1> %src.vec, <32 x i1> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
-  store <160 x i1> %tgt.mask, ptr %out.vec, align 64
+  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
+  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %tgt.mask = shufflevector <32 x i1> %src.mask, <32 x i1> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
+  %data = call <160 x i32> @llvm.masked.load.v160i32.p0(ptr %in.vec, i32 64, <160 x i1> %tgt.mask, <160 x i32> poison)
+  store <160 x i32> %data, ptr %out.vec, align 64
   ret void
 }
 
-define void @mask_replication_factor5_vf64(ptr %in.vec, ptr %out.vec) nounwind {
+define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512F-ONLY-LABEL: mask_replication_factor5_vf64:
 ; AVX512F-ONLY:       # %bb.0:
-; AVX512F-ONLY-NEXT:    pushq %rbp
-; AVX512F-ONLY-NEXT:    pushq %r15
-; AVX512F-ONLY-NEXT:    pushq %r14
-; AVX512F-ONLY-NEXT:    pushq %r13
-; AVX512F-ONLY-NEXT:    pushq %r12
-; AVX512F-ONLY-NEXT:    pushq %rbx
 ; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
-; AVX512F-ONLY-NEXT:    kmovw 2(%rdi), %k0
-; AVX512F-ONLY-NEXT:    kmovw 4(%rdi), %k3
-; AVX512F-ONLY-NEXT:    kmovw 6(%rdi), %k2
-; AVX512F-ONLY-NEXT:    kshiftrw $7, %k0, %k4
-; AVX512F-ONLY-NEXT:    kmovw %k4, %eax
-; AVX512F-ONLY-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512F-ONLY-NEXT:    kshiftrw $3, %k0, %k4
-; AVX512F-ONLY-NEXT:    kmovw %k4, %ecx
-; AVX512F-ONLY-NEXT:    kshiftrw $2, %k0, %k4
-; AVX512F-ONLY-NEXT:    kmovw %k4, %edx
-; AVX512F-ONLY-NEXT:    kshiftrw $1, %k0, %k4
-; AVX512F-ONLY-NEXT:    kmovw %k4, %edi
-; AVX512F-ONLY-NEXT:    kshiftrw $15, %k1, %k4
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r8d
-; AVX512F-ONLY-NEXT:    kshiftrw $14, %k1, %k4
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r9d
-; AVX512F-ONLY-NEXT:    kshiftrw $11, %k0, %k4
-; AVX512F-ONLY-NEXT:    kmovw %k4, %ebp
-; AVX512F-ONLY-NEXT:    kshiftrw $9, %k2, %k4
-; AVX512F-ONLY-NEXT:    kmovw %k4, %ebx
-; AVX512F-ONLY-NEXT:    kshiftrw $4, %k2, %k4
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r14d
-; AVX512F-ONLY-NEXT:    kshiftrw $6, %k1, %k4
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r10d
-; AVX512F-ONLY-NEXT:    kshiftrw $1, %k1, %k4
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r11d
-; AVX512F-ONLY-NEXT:    kshiftrw $2, %k1, %k4
-; AVX512F-ONLY-NEXT:    kmovw %k1, %r15d
-; AVX512F-ONLY-NEXT:    movzbl %r15b, %r15d
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    leaq (%r15,%r15,2), %r12
-; AVX512F-ONLY-NEXT:    leaq (%r12,%r15,4), %r12
-; AVX512F-ONLY-NEXT:    leaq (%r12,%r15,8), %r12
-; AVX512F-ONLY-NEXT:    shlq $4, %r15
-; AVX512F-ONLY-NEXT:    orq %r12, %r15
-; AVX512F-ONLY-NEXT:    movzbl %r11b, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %r11
-; AVX512F-ONLY-NEXT:    shlq $5, %r11
-; AVX512F-ONLY-NEXT:    orq %r15, %r11
-; AVX512F-ONLY-NEXT:    movq %r12, %r15
-; AVX512F-ONLY-NEXT:    shlq $6, %r15
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $7, %r13
-; AVX512F-ONLY-NEXT:    orq %r15, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %r15
-; AVX512F-ONLY-NEXT:    shlq $8, %r15
-; AVX512F-ONLY-NEXT:    orq %r13, %r15
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r13d
-; AVX512F-ONLY-NEXT:    kshiftrw $3, %k1, %k4
-; AVX512F-ONLY-NEXT:    shlq $9, %r12
-; AVX512F-ONLY-NEXT:    orq %r15, %r12
-; AVX512F-ONLY-NEXT:    movzbl %r13b, %r15d
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    movq %r15, %r13
-; AVX512F-ONLY-NEXT:    shlq $10, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $11, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    movq %r15, %r13
-; AVX512F-ONLY-NEXT:    shlq $12, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $13, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r13d
-; AVX512F-ONLY-NEXT:    kshiftrw $4, %k1, %k4
-; AVX512F-ONLY-NEXT:    shlq $14, %r15
-; AVX512F-ONLY-NEXT:    orq %r12, %r15
-; AVX512F-ONLY-NEXT:    movzbl %r13b, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $15, %r13
-; AVX512F-ONLY-NEXT:    orq %r15, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %r15
-; AVX512F-ONLY-NEXT:    shlq $16, %r15
-; AVX512F-ONLY-NEXT:    orq %r13, %r15
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $17, %r13
-; AVX512F-ONLY-NEXT:    orq %r15, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %r15
-; AVX512F-ONLY-NEXT:    shlq $18, %r15
-; AVX512F-ONLY-NEXT:    orq %r13, %r15
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r13d
-; AVX512F-ONLY-NEXT:    kshiftrw $5, %k1, %k4
-; AVX512F-ONLY-NEXT:    shlq $19, %r12
-; AVX512F-ONLY-NEXT:    orq %r15, %r12
-; AVX512F-ONLY-NEXT:    movzbl %r13b, %r15d
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    movq %r15, %r13
-; AVX512F-ONLY-NEXT:    shlq $20, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $21, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    movq %r15, %r13
-; AVX512F-ONLY-NEXT:    shlq $22, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $23, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r13d
-; AVX512F-ONLY-NEXT:    kshiftrw $12, %k1, %k4
-; AVX512F-ONLY-NEXT:    shlq $24, %r15
-; AVX512F-ONLY-NEXT:    orq %r12, %r15
-; AVX512F-ONLY-NEXT:    movzbl %r13b, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $25, %r13
-; AVX512F-ONLY-NEXT:    orq %r15, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %r15
-; AVX512F-ONLY-NEXT:    shlq $26, %r15
-; AVX512F-ONLY-NEXT:    orq %r13, %r15
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $27, %r13
-; AVX512F-ONLY-NEXT:    orq %r15, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %r15
-; AVX512F-ONLY-NEXT:    shlq $28, %r15
-; AVX512F-ONLY-NEXT:    orq %r13, %r15
-; AVX512F-ONLY-NEXT:    shlq $29, %r12
-; AVX512F-ONLY-NEXT:    orq %r15, %r12
-; AVX512F-ONLY-NEXT:    movzbl %r10b, %r13d
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $30, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r15d
-; AVX512F-ONLY-NEXT:    kshiftrw $7, %k1, %k4
-; AVX512F-ONLY-NEXT:    shlq $31, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    andl $1, %r10d
-; AVX512F-ONLY-NEXT:    movq %r10, %rax
-; AVX512F-ONLY-NEXT:    shlq $32, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r10, %r13
-; AVX512F-ONLY-NEXT:    shlq $33, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $8, %k1, %k4
-; AVX512F-ONLY-NEXT:    shlq $34, %r10
-; AVX512F-ONLY-NEXT:    orq %r13, %r10
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $35, %rax
-; AVX512F-ONLY-NEXT:    orq %r10, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r10
-; AVX512F-ONLY-NEXT:    shlq $36, %r10
-; AVX512F-ONLY-NEXT:    orq %rax, %r10
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $37, %rax
-; AVX512F-ONLY-NEXT:    orq %r10, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $38, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r10d
-; AVX512F-ONLY-NEXT:    kshiftrw $9, %k1, %k4
-; AVX512F-ONLY-NEXT:    shlq $39, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    andl $1, %r10d
-; AVX512F-ONLY-NEXT:    movq %r10, %rax
-; AVX512F-ONLY-NEXT:    shlq $40, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r10, %r12
-; AVX512F-ONLY-NEXT:    shlq $41, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r10, %rax
-; AVX512F-ONLY-NEXT:    shlq $42, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r10, %r13
-; AVX512F-ONLY-NEXT:    shlq $43, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $10, %k1, %k4
-; AVX512F-ONLY-NEXT:    shlq $44, %r10
-; AVX512F-ONLY-NEXT:    orq %r13, %r10
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $45, %rax
-; AVX512F-ONLY-NEXT:    orq %r10, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r10
-; AVX512F-ONLY-NEXT:    shlq $46, %r10
-; AVX512F-ONLY-NEXT:    orq %rax, %r10
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $47, %rax
-; AVX512F-ONLY-NEXT:    orq %r10, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $48, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r10d
-; AVX512F-ONLY-NEXT:    kshiftrw $11, %k1, %k4
-; AVX512F-ONLY-NEXT:    shlq $49, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    andl $1, %r10d
-; AVX512F-ONLY-NEXT:    movq %r10, %rax
-; AVX512F-ONLY-NEXT:    shlq $50, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r10, %r12
-; AVX512F-ONLY-NEXT:    shlq $51, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r10, %rax
-; AVX512F-ONLY-NEXT:    shlq $52, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r10, %r13
-; AVX512F-ONLY-NEXT:    shlq $53, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $3, %k2, %k4
-; AVX512F-ONLY-NEXT:    shlq $54, %r10
-; AVX512F-ONLY-NEXT:    orq %r13, %r10
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $55, %rax
-; AVX512F-ONLY-NEXT:    orq %r10, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r10
-; AVX512F-ONLY-NEXT:    shlq $56, %r10
-; AVX512F-ONLY-NEXT:    orq %rax, %r10
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $57, %rax
-; AVX512F-ONLY-NEXT:    orq %r10, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r10
-; AVX512F-ONLY-NEXT:    shlq $58, %r10
-; AVX512F-ONLY-NEXT:    orq %rax, %r10
-; AVX512F-ONLY-NEXT:    shlq $59, %r12
-; AVX512F-ONLY-NEXT:    orq %r10, %r12
-; AVX512F-ONLY-NEXT:    movzbl %r15b, %r10d
-; AVX512F-ONLY-NEXT:    movl %r15d, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %r15
-; AVX512F-ONLY-NEXT:    shlq $60, %r15
-; AVX512F-ONLY-NEXT:    orq %r12, %r15
-; AVX512F-ONLY-NEXT:    movq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $61, %r12
-; AVX512F-ONLY-NEXT:    orq %r15, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r15d
-; AVX512F-ONLY-NEXT:    kshiftrw $5, %k2, %k4
-; AVX512F-ONLY-NEXT:    shlq $62, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r10, %r12
-; AVX512F-ONLY-NEXT:    shlq $63, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    orq %r11, %r12
-; AVX512F-ONLY-NEXT:    movq %r12, (%rsi)
-; AVX512F-ONLY-NEXT:    movzbl %r15b, %r11d
-; AVX512F-ONLY-NEXT:    movl %r11d, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    leaq (%rax,%rax,2), %r15
-; AVX512F-ONLY-NEXT:    leaq (%r15,%rax,4), %r15
-; AVX512F-ONLY-NEXT:    leaq (%r15,%rax,8), %rax
-; AVX512F-ONLY-NEXT:    movzbl %r14b, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %r15
-; AVX512F-ONLY-NEXT:    shlq $4, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    movq %r12, %r14
-; AVX512F-ONLY-NEXT:    shlq $5, %r14
-; AVX512F-ONLY-NEXT:    orq %r15, %r14
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $6, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r15
-; AVX512F-ONLY-NEXT:    shlq $7, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    kmovw %k4, %eax
-; AVX512F-ONLY-NEXT:    kshiftrw $6, %k2, %k4
-; AVX512F-ONLY-NEXT:    shlq $8, %r12
-; AVX512F-ONLY-NEXT:    orq %r15, %r12
-; AVX512F-ONLY-NEXT:    movzbl %al, %r15d
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $9, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $10, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $11, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $12, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k4, %eax
-; AVX512F-ONLY-NEXT:    kshiftrw $7, %k2, %k4
-; AVX512F-ONLY-NEXT:    shlq $13, %r15
-; AVX512F-ONLY-NEXT:    orq %r12, %r15
-; AVX512F-ONLY-NEXT:    movzbl %al, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $14, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r15
-; AVX512F-ONLY-NEXT:    shlq $15, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $16, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r15
-; AVX512F-ONLY-NEXT:    shlq $17, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    kmovw %k4, %eax
-; AVX512F-ONLY-NEXT:    kshiftrw $8, %k2, %k4
-; AVX512F-ONLY-NEXT:    shlq $18, %r12
-; AVX512F-ONLY-NEXT:    orq %r15, %r12
-; AVX512F-ONLY-NEXT:    movzbl %al, %r15d
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $19, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $20, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $21, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $22, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k4, %eax
-; AVX512F-ONLY-NEXT:    kshiftrw $15, %k2, %k4
-; AVX512F-ONLY-NEXT:    shlq $23, %r15
-; AVX512F-ONLY-NEXT:    orq %r12, %r15
-; AVX512F-ONLY-NEXT:    movzbl %al, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $24, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r15
-; AVX512F-ONLY-NEXT:    shlq $25, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $26, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r15
-; AVX512F-ONLY-NEXT:    shlq $27, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    shlq $28, %r12
-; AVX512F-ONLY-NEXT:    orq %r15, %r12
-; AVX512F-ONLY-NEXT:    movzbl %bl, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %r15
-; AVX512F-ONLY-NEXT:    shlq $29, %r15
-; AVX512F-ONLY-NEXT:    orq %r12, %r15
-; AVX512F-ONLY-NEXT:    movq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $30, %r12
-; AVX512F-ONLY-NEXT:    orq %r15, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r15d
-; AVX512F-ONLY-NEXT:    kshiftrw $10, %k2, %k4
-; AVX512F-ONLY-NEXT:    shlq $31, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    andl $1, %ebx
-; AVX512F-ONLY-NEXT:    movq %rbx, %r13
-; AVX512F-ONLY-NEXT:    shlq $32, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $11, %k2, %k4
-; AVX512F-ONLY-NEXT:    shlq $33, %rbx
-; AVX512F-ONLY-NEXT:    orq %r13, %rbx
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $34, %rax
-; AVX512F-ONLY-NEXT:    orq %rbx, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %rbx
-; AVX512F-ONLY-NEXT:    shlq $35, %rbx
-; AVX512F-ONLY-NEXT:    orq %rax, %rbx
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $36, %rax
-; AVX512F-ONLY-NEXT:    orq %rbx, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $37, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k4, %ebx
-; AVX512F-ONLY-NEXT:    kshiftrw $12, %k2, %k4
-; AVX512F-ONLY-NEXT:    shlq $38, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    andl $1, %ebx
-; AVX512F-ONLY-NEXT:    movq %rbx, %rax
-; AVX512F-ONLY-NEXT:    shlq $39, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %rbx, %r12
-; AVX512F-ONLY-NEXT:    shlq $40, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %rbx, %rax
-; AVX512F-ONLY-NEXT:    shlq $41, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %rbx, %r13
-; AVX512F-ONLY-NEXT:    shlq $42, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $13, %k2, %k4
-; AVX512F-ONLY-NEXT:    shlq $43, %rbx
-; AVX512F-ONLY-NEXT:    orq %r13, %rbx
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $44, %rax
-; AVX512F-ONLY-NEXT:    orq %rbx, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %rbx
-; AVX512F-ONLY-NEXT:    shlq $45, %rbx
-; AVX512F-ONLY-NEXT:    orq %rax, %rbx
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $46, %rax
-; AVX512F-ONLY-NEXT:    orq %rbx, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %rbx
-; AVX512F-ONLY-NEXT:    shlq $47, %rbx
-; AVX512F-ONLY-NEXT:    orq %rax, %rbx
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r13d
-; AVX512F-ONLY-NEXT:    kshiftrw $14, %k2, %k4
-; AVX512F-ONLY-NEXT:    shlq $48, %r12
-; AVX512F-ONLY-NEXT:    orq %rbx, %r12
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $49, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %rbx
-; AVX512F-ONLY-NEXT:    shlq $50, %rbx
-; AVX512F-ONLY-NEXT:    orq %rax, %rbx
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $51, %rax
-; AVX512F-ONLY-NEXT:    orq %rbx, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $52, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k4, %ebx
-; AVX512F-ONLY-NEXT:    kshiftrw $9, %k0, %k4
-; AVX512F-ONLY-NEXT:    shlq $53, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    andl $1, %ebx
-; AVX512F-ONLY-NEXT:    movq %rbx, %rax
-; AVX512F-ONLY-NEXT:    shlq $54, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %rbx, %r12
-; AVX512F-ONLY-NEXT:    shlq $55, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %rbx, %rax
-; AVX512F-ONLY-NEXT:    shlq $56, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %rbx, %r12
-; AVX512F-ONLY-NEXT:    shlq $57, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $58, %rbx
-; AVX512F-ONLY-NEXT:    orq %r12, %rbx
-; AVX512F-ONLY-NEXT:    movzbl %r15b, %eax
-; AVX512F-ONLY-NEXT:    # kill: def $r15d killed $r15d def $r15
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $59, %r12
-; AVX512F-ONLY-NEXT:    orq %rbx, %r12
-; AVX512F-ONLY-NEXT:    movq %r15, %rbx
-; AVX512F-ONLY-NEXT:    shlq $60, %rbx
-; AVX512F-ONLY-NEXT:    orq %r12, %rbx
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $61, %r12
-; AVX512F-ONLY-NEXT:    orq %rbx, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k4, %ebx
-; AVX512F-ONLY-NEXT:    kshiftrw $10, %k0, %k4
-; AVX512F-ONLY-NEXT:    shlq $62, %r15
-; AVX512F-ONLY-NEXT:    orq %r12, %r15
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $12, %k0, %k4
-; AVX512F-ONLY-NEXT:    shlq $63, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    orq %r14, %rax
-; AVX512F-ONLY-NEXT:    movq %rax, 32(%rsi)
-; AVX512F-ONLY-NEXT:    movzbl %r12b, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movzbl %bl, %r14d
-; AVX512F-ONLY-NEXT:    movl %r14d, %r15d
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    leaq (%r15,%r15,2), %r15
-; AVX512F-ONLY-NEXT:    leaq (%r15,%rax,4), %r15
-; AVX512F-ONLY-NEXT:    leaq (%r15,%rax,8), %r15
-; AVX512F-ONLY-NEXT:    movq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $4, %r12
-; AVX512F-ONLY-NEXT:    orq %r15, %r12
-; AVX512F-ONLY-NEXT:    movq %rax, %r15
-; AVX512F-ONLY-NEXT:    shlq $5, %r15
-; AVX512F-ONLY-NEXT:    orq %r12, %r15
-; AVX512F-ONLY-NEXT:    shlq $6, %rax
-; AVX512F-ONLY-NEXT:    movzbl %bpl, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $7, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $8, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $9, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $10, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k4, %ebp
-; AVX512F-ONLY-NEXT:    kshiftrw $13, %k0, %k4
-; AVX512F-ONLY-NEXT:    shlq $11, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movzbl %bpl, %r13d
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $12, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $13, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $14, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $15, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k4, %eax
-; AVX512F-ONLY-NEXT:    kshiftrw $14, %k0, %k4
-; AVX512F-ONLY-NEXT:    shlq $16, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    movzbl %al, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $17, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $18, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $19, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $20, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k4, %eax
-; AVX512F-ONLY-NEXT:    kshiftrw $15, %k0, %k4
-; AVX512F-ONLY-NEXT:    shlq $21, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    movzbl %al, %r13d
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $22, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $23, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $24, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $25, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k4, %eax
-; AVX512F-ONLY-NEXT:    kshiftrw $1, %k3, %k4
-; AVX512F-ONLY-NEXT:    shlq $26, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    movzbl %al, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $27, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $28, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $29, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $30, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    shlq $31, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k3, %r13d
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $32, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $33, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $34, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %rbp
-; AVX512F-ONLY-NEXT:    shlq $35, %rbp
-; AVX512F-ONLY-NEXT:    orq %rax, %rbp
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $2, %k3, %k4
-; AVX512F-ONLY-NEXT:    shlq $36, %r13
-; AVX512F-ONLY-NEXT:    orq %rbp, %r13
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $37, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $38, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $39, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %rbp
-; AVX512F-ONLY-NEXT:    shlq $40, %rbp
-; AVX512F-ONLY-NEXT:    orq %rax, %rbp
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r13d
-; AVX512F-ONLY-NEXT:    kshiftrw $3, %k3, %k4
-; AVX512F-ONLY-NEXT:    shlq $41, %r12
-; AVX512F-ONLY-NEXT:    orq %rbp, %r12
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $42, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $43, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $44, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %rbp
-; AVX512F-ONLY-NEXT:    shlq $45, %rbp
-; AVX512F-ONLY-NEXT:    orq %rax, %rbp
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $4, %k3, %k4
-; AVX512F-ONLY-NEXT:    shlq $46, %r13
-; AVX512F-ONLY-NEXT:    orq %rbp, %r13
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $47, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $48, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $49, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %rbp
-; AVX512F-ONLY-NEXT:    shlq $50, %rbp
-; AVX512F-ONLY-NEXT:    orq %rax, %rbp
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r13d
-; AVX512F-ONLY-NEXT:    kshiftrw $5, %k3, %k4
-; AVX512F-ONLY-NEXT:    shlq $51, %r12
-; AVX512F-ONLY-NEXT:    orq %rbp, %r12
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $52, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $53, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $54, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %rbp
-; AVX512F-ONLY-NEXT:    shlq $55, %rbp
-; AVX512F-ONLY-NEXT:    orq %rax, %rbp
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $6, %k3, %k4
-; AVX512F-ONLY-NEXT:    shlq $56, %r13
-; AVX512F-ONLY-NEXT:    orq %rbp, %r13
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $57, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $58, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $59, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $60, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k4, %eax
-; AVX512F-ONLY-NEXT:    kshiftrw $8, %k3, %k4
-; AVX512F-ONLY-NEXT:    shlq $61, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    movzbl %al, %r13d
-; AVX512F-ONLY-NEXT:    # kill: def $eax killed $eax def $rax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    shlq $62, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $63, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k4, %eax
-; AVX512F-ONLY-NEXT:    kshiftrw $7, %k3, %k4
-; AVX512F-ONLY-NEXT:    orq %r15, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k4, %ebp
-; AVX512F-ONLY-NEXT:    kshiftrw $9, %k3, %k4
-; AVX512F-ONLY-NEXT:    movq %r12, 16(%rsi)
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    leaq (%r13,%r13,2), %r15
-; AVX512F-ONLY-NEXT:    leaq (%r15,%r13,4), %r15
-; AVX512F-ONLY-NEXT:    movzbl %bpl, %r13d
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    leaq (%r15,%r13,8), %r15
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $4, %r12
-; AVX512F-ONLY-NEXT:    orq %r15, %r12
-; AVX512F-ONLY-NEXT:    movq %r13, %r15
-; AVX512F-ONLY-NEXT:    shlq $5, %r15
-; AVX512F-ONLY-NEXT:    orq %r12, %r15
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $6, %r12
-; AVX512F-ONLY-NEXT:    shlq $7, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    movzbl %al, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $8, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $9, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $10, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $11, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k4, %eax
-; AVX512F-ONLY-NEXT:    kshiftrw $10, %k3, %k4
-; AVX512F-ONLY-NEXT:    shlq $12, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    movzbl %al, %r13d
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $13, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $14, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $15, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $16, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k4, %eax
-; AVX512F-ONLY-NEXT:    kshiftrw $11, %k3, %k4
-; AVX512F-ONLY-NEXT:    shlq $17, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    movzbl %al, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $18, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $19, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $20, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $21, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k4, %eax
-; AVX512F-ONLY-NEXT:    kshiftrw $12, %k3, %k4
-; AVX512F-ONLY-NEXT:    shlq $22, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    movzbl %al, %r13d
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $23, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $24, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $25, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $26, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k4, %eax
-; AVX512F-ONLY-NEXT:    kshiftrw $13, %k3, %k4
-; AVX512F-ONLY-NEXT:    shlq $27, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    movzbl %al, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $28, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $29, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $30, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %rbp
-; AVX512F-ONLY-NEXT:    shlq $31, %rbp
-; AVX512F-ONLY-NEXT:    orq %rax, %rbp
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r13d
-; AVX512F-ONLY-NEXT:    kshiftrw $14, %k3, %k4
-; AVX512F-ONLY-NEXT:    shlq $32, %r12
-; AVX512F-ONLY-NEXT:    orq %rbp, %r12
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $33, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $34, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $35, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %rbp
-; AVX512F-ONLY-NEXT:    shlq $36, %rbp
-; AVX512F-ONLY-NEXT:    orq %rax, %rbp
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $15, %k3, %k3
-; AVX512F-ONLY-NEXT:    shlq $37, %r13
-; AVX512F-ONLY-NEXT:    orq %rbp, %r13
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $38, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $39, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $40, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $41, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k3, %eax
-; AVX512F-ONLY-NEXT:    kshiftrw $1, %k2, %k3
-; AVX512F-ONLY-NEXT:    shlq $42, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    movzbl %al, %r13d
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $43, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $44, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $45, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $46, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $47, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $48, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $49, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $50, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %rbp
-; AVX512F-ONLY-NEXT:    shlq $51, %rbp
-; AVX512F-ONLY-NEXT:    orq %rax, %rbp
-; AVX512F-ONLY-NEXT:    kmovw %k3, %r13d
-; AVX512F-ONLY-NEXT:    kshiftrw $2, %k2, %k2
-; AVX512F-ONLY-NEXT:    shlq $52, %r12
-; AVX512F-ONLY-NEXT:    orq %rbp, %r12
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $53, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $54, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $55, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %rbp
-; AVX512F-ONLY-NEXT:    shlq $56, %rbp
-; AVX512F-ONLY-NEXT:    orq %rax, %rbp
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $13, %k1, %k1
-; AVX512F-ONLY-NEXT:    shlq $57, %r13
-; AVX512F-ONLY-NEXT:    orq %rbp, %r13
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $58, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $59, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $60, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $61, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k1, %ebp
-; AVX512F-ONLY-NEXT:    kshiftrw $4, %k0, %k1
-; AVX512F-ONLY-NEXT:    shlq $62, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k1, %r13d
-; AVX512F-ONLY-NEXT:    kshiftrw $5, %k0, %k1
-; AVX512F-ONLY-NEXT:    shlq $63, %r11
-; AVX512F-ONLY-NEXT:    orq %r12, %r11
-; AVX512F-ONLY-NEXT:    kmovw %k1, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $6, %k0, %k1
-; AVX512F-ONLY-NEXT:    orq %r15, %r11
-; AVX512F-ONLY-NEXT:    kmovw %k1, %r15d
-; AVX512F-ONLY-NEXT:    movq %r11, 24(%rsi)
-; AVX512F-ONLY-NEXT:    movzbl %bpl, %r11d
-; AVX512F-ONLY-NEXT:    andl $1, %r11d
-; AVX512F-ONLY-NEXT:    andl $1, %r10d
-; AVX512F-ONLY-NEXT:    leaq (%r10,%r11,2), %rax
-; AVX512F-ONLY-NEXT:    leaq (%rax,%r11,4), %rax
-; AVX512F-ONLY-NEXT:    leaq (%rax,%r11,8), %rax
-; AVX512F-ONLY-NEXT:    movq %r11, %r10
-; AVX512F-ONLY-NEXT:    shlq $4, %r10
-; AVX512F-ONLY-NEXT:    orq %rax, %r10
-; AVX512F-ONLY-NEXT:    shlq $5, %r11
-; AVX512F-ONLY-NEXT:    orq %r10, %r11
-; AVX512F-ONLY-NEXT:    movzbl %r9b, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %r9
-; AVX512F-ONLY-NEXT:    shlq $6, %r9
-; AVX512F-ONLY-NEXT:    movq %rax, %r10
-; AVX512F-ONLY-NEXT:    shlq $7, %r10
-; AVX512F-ONLY-NEXT:    orq %r9, %r10
-; AVX512F-ONLY-NEXT:    movq %rax, %r9
-; AVX512F-ONLY-NEXT:    shlq $8, %r9
-; AVX512F-ONLY-NEXT:    orq %r10, %r9
-; AVX512F-ONLY-NEXT:    movq %rax, %rbp
-; AVX512F-ONLY-NEXT:    shlq $9, %rbp
-; AVX512F-ONLY-NEXT:    orq %r9, %rbp
-; AVX512F-ONLY-NEXT:    kmovw %k0, %r10d
-; AVX512F-ONLY-NEXT:    kshiftrw $8, %k0, %k0
-; AVX512F-ONLY-NEXT:    shlq $10, %rax
-; AVX512F-ONLY-NEXT:    orq %rbp, %rax
-; AVX512F-ONLY-NEXT:    movzbl %r8b, %r9d
-; AVX512F-ONLY-NEXT:    andl $1, %r9d
-; AVX512F-ONLY-NEXT:    movq %r9, %r8
-; AVX512F-ONLY-NEXT:    shlq $11, %r8
-; AVX512F-ONLY-NEXT:    orq %rax, %r8
-; AVX512F-ONLY-NEXT:    movq %r9, %rax
-; AVX512F-ONLY-NEXT:    shlq $12, %rax
-; AVX512F-ONLY-NEXT:    orq %r8, %rax
-; AVX512F-ONLY-NEXT:    movq %r9, %r8
-; AVX512F-ONLY-NEXT:    shlq $13, %r8
-; AVX512F-ONLY-NEXT:    orq %rax, %r8
-; AVX512F-ONLY-NEXT:    movq %r9, %rax
-; AVX512F-ONLY-NEXT:    shlq $14, %rax
-; AVX512F-ONLY-NEXT:    orq %r8, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k0, %r8d
-; AVX512F-ONLY-NEXT:    shlq $15, %r9
-; AVX512F-ONLY-NEXT:    orq %rax, %r9
-; AVX512F-ONLY-NEXT:    movzbl %r10b, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %r10
-; AVX512F-ONLY-NEXT:    shlq $16, %r10
-; AVX512F-ONLY-NEXT:    orq %r9, %r10
-; AVX512F-ONLY-NEXT:    movq %rax, %r9
-; AVX512F-ONLY-NEXT:    shlq $17, %r9
-; AVX512F-ONLY-NEXT:    orq %r10, %r9
-; AVX512F-ONLY-NEXT:    movq %rax, %r10
-; AVX512F-ONLY-NEXT:    shlq $18, %r10
-; AVX512F-ONLY-NEXT:    orq %r9, %r10
-; AVX512F-ONLY-NEXT:    movq %rax, %r9
-; AVX512F-ONLY-NEXT:    shlq $19, %r9
-; AVX512F-ONLY-NEXT:    orq %r10, %r9
-; AVX512F-ONLY-NEXT:    shlq $20, %rax
-; AVX512F-ONLY-NEXT:    orq %r9, %rax
-; AVX512F-ONLY-NEXT:    movzbl %dil, %edi
-; AVX512F-ONLY-NEXT:    andl $1, %edi
-; AVX512F-ONLY-NEXT:    movq %rdi, %r9
-; AVX512F-ONLY-NEXT:    shlq $21, %r9
-; AVX512F-ONLY-NEXT:    orq %rax, %r9
-; AVX512F-ONLY-NEXT:    movq %rdi, %rax
-; AVX512F-ONLY-NEXT:    shlq $22, %rax
-; AVX512F-ONLY-NEXT:    orq %r9, %rax
-; AVX512F-ONLY-NEXT:    movq %rdi, %r9
-; AVX512F-ONLY-NEXT:    shlq $23, %r9
-; AVX512F-ONLY-NEXT:    orq %rax, %r9
-; AVX512F-ONLY-NEXT:    movq %rdi, %rax
-; AVX512F-ONLY-NEXT:    shlq $24, %rax
-; AVX512F-ONLY-NEXT:    orq %r9, %rax
-; AVX512F-ONLY-NEXT:    shlq $25, %rdi
-; AVX512F-ONLY-NEXT:    orq %rax, %rdi
-; AVX512F-ONLY-NEXT:    movzbl %dl, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %rdx
-; AVX512F-ONLY-NEXT:    shlq $26, %rdx
-; AVX512F-ONLY-NEXT:    orq %rdi, %rdx
-; AVX512F-ONLY-NEXT:    movq %rax, %rdi
-; AVX512F-ONLY-NEXT:    shlq $27, %rdi
-; AVX512F-ONLY-NEXT:    orq %rdx, %rdi
-; AVX512F-ONLY-NEXT:    movq %rax, %rdx
-; AVX512F-ONLY-NEXT:    shlq $28, %rdx
-; AVX512F-ONLY-NEXT:    orq %rdi, %rdx
-; AVX512F-ONLY-NEXT:    movq %rax, %rdi
-; AVX512F-ONLY-NEXT:    shlq $29, %rdi
-; AVX512F-ONLY-NEXT:    orq %rdx, %rdi
-; AVX512F-ONLY-NEXT:    shlq $30, %rax
-; AVX512F-ONLY-NEXT:    orq %rdi, %rax
-; AVX512F-ONLY-NEXT:    movzbl %cl, %ecx
-; AVX512F-ONLY-NEXT:    andl $1, %ecx
-; AVX512F-ONLY-NEXT:    movq %rcx, %rdx
-; AVX512F-ONLY-NEXT:    shlq $31, %rdx
-; AVX512F-ONLY-NEXT:    orq %rax, %rdx
-; AVX512F-ONLY-NEXT:    movq %rcx, %rax
-; AVX512F-ONLY-NEXT:    shlq $32, %rax
-; AVX512F-ONLY-NEXT:    orq %rdx, %rax
-; AVX512F-ONLY-NEXT:    movq %rcx, %rdx
-; AVX512F-ONLY-NEXT:    shlq $33, %rdx
-; AVX512F-ONLY-NEXT:    orq %rax, %rdx
-; AVX512F-ONLY-NEXT:    movq %rcx, %rax
-; AVX512F-ONLY-NEXT:    shlq $34, %rax
-; AVX512F-ONLY-NEXT:    orq %rdx, %rax
-; AVX512F-ONLY-NEXT:    shlq $35, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $36, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %rcx
-; AVX512F-ONLY-NEXT:    shlq $37, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $38, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %rcx
-; AVX512F-ONLY-NEXT:    shlq $39, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    shlq $40, %r13
-; AVX512F-ONLY-NEXT:    orq %rcx, %r13
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $41, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %rcx
-; AVX512F-ONLY-NEXT:    shlq $42, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $43, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %rcx
-; AVX512F-ONLY-NEXT:    shlq $44, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    shlq $45, %r12
-; AVX512F-ONLY-NEXT:    orq %rcx, %r12
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $46, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %rcx
-; AVX512F-ONLY-NEXT:    shlq $47, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $48, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %rcx
-; AVX512F-ONLY-NEXT:    shlq $49, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    shlq $50, %r15
-; AVX512F-ONLY-NEXT:    orq %rcx, %r15
-; AVX512F-ONLY-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %rcx
-; AVX512F-ONLY-NEXT:    shlq $51, %rcx
-; AVX512F-ONLY-NEXT:    orq %r15, %rcx
-; AVX512F-ONLY-NEXT:    movq %rax, %rdx
-; AVX512F-ONLY-NEXT:    shlq $52, %rdx
-; AVX512F-ONLY-NEXT:    orq %rcx, %rdx
-; AVX512F-ONLY-NEXT:    movq %rax, %rcx
-; AVX512F-ONLY-NEXT:    shlq $53, %rcx
-; AVX512F-ONLY-NEXT:    orq %rdx, %rcx
-; AVX512F-ONLY-NEXT:    movq %rax, %rdx
-; AVX512F-ONLY-NEXT:    shlq $54, %rdx
-; AVX512F-ONLY-NEXT:    orq %rcx, %rdx
-; AVX512F-ONLY-NEXT:    shlq $55, %rax
-; AVX512F-ONLY-NEXT:    orq %rdx, %rax
-; AVX512F-ONLY-NEXT:    andl $1, %r8d
-; AVX512F-ONLY-NEXT:    movq %r8, %rcx
-; AVX512F-ONLY-NEXT:    shlq $56, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    movq %r8, %rax
-; AVX512F-ONLY-NEXT:    shlq $57, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    movq %r8, %rcx
-; AVX512F-ONLY-NEXT:    shlq $58, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    movq %r8, %rax
-; AVX512F-ONLY-NEXT:    shlq $59, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    shlq $60, %r8
-; AVX512F-ONLY-NEXT:    orq %rax, %r8
-; AVX512F-ONLY-NEXT:    andl $1, %ebx
-; AVX512F-ONLY-NEXT:    movq %rbx, %rax
-; AVX512F-ONLY-NEXT:    shlq $61, %rax
-; AVX512F-ONLY-NEXT:    orq %r8, %rax
-; AVX512F-ONLY-NEXT:    shlq $62, %rbx
-; AVX512F-ONLY-NEXT:    orq %rax, %rbx
-; AVX512F-ONLY-NEXT:    shlq $63, %r14
-; AVX512F-ONLY-NEXT:    orq %rbx, %r14
-; AVX512F-ONLY-NEXT:    orq %r11, %r14
-; AVX512F-ONLY-NEXT:    movq %r14, 8(%rsi)
-; AVX512F-ONLY-NEXT:    popq %rbx
-; AVX512F-ONLY-NEXT:    popq %r12
-; AVX512F-ONLY-NEXT:    popq %r13
-; AVX512F-ONLY-NEXT:    popq %r14
-; AVX512F-ONLY-NEXT:    popq %r15
-; AVX512F-ONLY-NEXT:    popq %rbp
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm2
+; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; AVX512F-ONLY-NEXT:    movw $1, %ax
+; AVX512F-ONLY-NEXT:    kmovw %eax, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1}
+; AVX512F-ONLY-NEXT:    kmovw 6(%rdi), %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
+; AVX512F-ONLY-NEXT:    kmovw 4(%rdi), %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z}
+; AVX512F-ONLY-NEXT:    kmovw 2(%rdi), %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15]
+; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm2, %zmm6
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12]
+; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm7, %zmm8
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9]
+; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm9, %zmm10
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6]
+; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm11, %zmm12
+; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm1, %zmm3
+; AVX512F-ONLY-NEXT:    vpermd %zmm4, %zmm2, %zmm13
+; AVX512F-ONLY-NEXT:    vpermd %zmm4, %zmm7, %zmm14
+; AVX512F-ONLY-NEXT:    vpermd %zmm4, %zmm9, %zmm15
+; AVX512F-ONLY-NEXT:    vpermd %zmm4, %zmm11, %zmm16
+; AVX512F-ONLY-NEXT:    vpermd %zmm4, %zmm1, %zmm4
+; AVX512F-ONLY-NEXT:    vpermd %zmm5, %zmm2, %zmm17
+; AVX512F-ONLY-NEXT:    vpermd %zmm5, %zmm7, %zmm18
+; AVX512F-ONLY-NEXT:    vpermd %zmm5, %zmm9, %zmm19
+; AVX512F-ONLY-NEXT:    vpermd %zmm5, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    vpermd %zmm5, %zmm11, %zmm5
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm2, %zmm2
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm7, %zmm7
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm9, %zmm9
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm11, %zmm0
+; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm11 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm9, %zmm9, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm9 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm7, %zmm7, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm7 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 256(%rsi), %zmm2 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 320(%rsi), %zmm1 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm5, %zmm5, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 384(%rsi), %zmm5 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm19, %zmm19, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 448(%rsi), %zmm19 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm18, %zmm18, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 512(%rsi), %zmm18 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm17, %zmm17, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 576(%rsi), %zmm17 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm4, %zmm4, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 640(%rsi), %zmm4 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm16, %zmm16, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 704(%rsi), %zmm16 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm15, %zmm15, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 768(%rsi), %zmm15 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm14, %zmm14, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 832(%rsi), %zmm14 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm13, %zmm13, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 896(%rsi), %zmm13 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm3, %zmm3, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 960(%rsi), %zmm3 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm12, %zmm12, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 1024(%rsi), %zmm12 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm10, %zmm10, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 1088(%rsi), %zmm10 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm8, %zmm8, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 1152(%rsi), %zmm8 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm6, %zmm6, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 1216(%rsi), %zmm6 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm6, 1216(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm8, 1152(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm10, 1088(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm12, 1024(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm3, 960(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm13, 896(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm14, 832(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm15, 768(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm16, 704(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm4, 640(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm17, 576(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm18, 512(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm19, 448(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm5, 384(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm1, 320(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm2, 256(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm7, 192(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm9, 128(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm11, (%rdx)
+; AVX512F-ONLY-NEXT:    vzeroupper
 ; AVX512F-ONLY-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: mask_replication_factor5_vf64:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    pushq %rbp
-; AVX512DQ-NEXT:    pushq %r15
-; AVX512DQ-NEXT:    pushq %r14
-; AVX512DQ-NEXT:    pushq %r13
-; AVX512DQ-NEXT:    pushq %r12
-; AVX512DQ-NEXT:    pushq %rbx
-; AVX512DQ-NEXT:    kmovw (%rdi), %k1
+; AVX512DQ-NEXT:    kmovw (%rdi), %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm2
+; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm2
+; AVX512DQ-NEXT:    movw $1, %ax
+; AVX512DQ-NEXT:    kmovw %eax, %k1
+; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1}
+; AVX512DQ-NEXT:    kmovw 6(%rdi), %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm3
+; AVX512DQ-NEXT:    kmovw 4(%rdi), %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm4
 ; AVX512DQ-NEXT:    kmovw 2(%rdi), %k0
-; AVX512DQ-NEXT:    kmovw 4(%rdi), %k3
-; AVX512DQ-NEXT:    kmovw 6(%rdi), %k2
-; AVX512DQ-NEXT:    kshiftrw $7, %k0, %k4
-; AVX512DQ-NEXT:    kmovw %k4, %eax
-; AVX512DQ-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512DQ-NEXT:    kshiftrw $3, %k0, %k4
-; AVX512DQ-NEXT:    kmovw %k4, %ecx
-; AVX512DQ-NEXT:    kshiftrw $2, %k0, %k4
-; AVX512DQ-NEXT:    kmovw %k4, %edx
-; AVX512DQ-NEXT:    kshiftrw $1, %k0, %k4
-; AVX512DQ-NEXT:    kmovw %k4, %edi
-; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k4
-; AVX512DQ-NEXT:    kmovw %k4, %r8d
-; AVX512DQ-NEXT:    kshiftrw $14, %k1, %k4
-; AVX512DQ-NEXT:    kmovw %k4, %r9d
-; AVX512DQ-NEXT:    kshiftrw $11, %k0, %k4
-; AVX512DQ-NEXT:    kmovw %k4, %ebp
-; AVX512DQ-NEXT:    kshiftrw $9, %k2, %k4
-; AVX512DQ-NEXT:    kmovw %k4, %ebx
-; AVX512DQ-NEXT:    kshiftrw $4, %k2, %k4
-; AVX512DQ-NEXT:    kmovw %k4, %r14d
-; AVX512DQ-NEXT:    kshiftrw $6, %k1, %k4
-; AVX512DQ-NEXT:    kmovw %k4, %r10d
-; AVX512DQ-NEXT:    kshiftrw $1, %k1, %k4
-; AVX512DQ-NEXT:    kmovw %k4, %r11d
-; AVX512DQ-NEXT:    kshiftrw $2, %k1, %k4
-; AVX512DQ-NEXT:    kmovw %k1, %r15d
-; AVX512DQ-NEXT:    movzbl %r15b, %r15d
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    leaq (%r15,%r15,2), %r12
-; AVX512DQ-NEXT:    leaq (%r12,%r15,4), %r12
-; AVX512DQ-NEXT:    leaq (%r12,%r15,8), %r12
-; AVX512DQ-NEXT:    shlq $4, %r15
-; AVX512DQ-NEXT:    orq %r12, %r15
-; AVX512DQ-NEXT:    movzbl %r11b, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %r11
-; AVX512DQ-NEXT:    shlq $5, %r11
-; AVX512DQ-NEXT:    orq %r15, %r11
-; AVX512DQ-NEXT:    movq %r12, %r15
-; AVX512DQ-NEXT:    shlq $6, %r15
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $7, %r13
-; AVX512DQ-NEXT:    orq %r15, %r13
-; AVX512DQ-NEXT:    movq %r12, %r15
-; AVX512DQ-NEXT:    shlq $8, %r15
-; AVX512DQ-NEXT:    orq %r13, %r15
-; AVX512DQ-NEXT:    kmovw %k4, %r13d
-; AVX512DQ-NEXT:    kshiftrw $3, %k1, %k4
-; AVX512DQ-NEXT:    shlq $9, %r12
-; AVX512DQ-NEXT:    orq %r15, %r12
-; AVX512DQ-NEXT:    movzbl %r13b, %r15d
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    movq %r15, %r13
-; AVX512DQ-NEXT:    shlq $10, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $11, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    movq %r15, %r13
-; AVX512DQ-NEXT:    shlq $12, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $13, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    kmovw %k4, %r13d
-; AVX512DQ-NEXT:    kshiftrw $4, %k1, %k4
-; AVX512DQ-NEXT:    shlq $14, %r15
-; AVX512DQ-NEXT:    orq %r12, %r15
-; AVX512DQ-NEXT:    movzbl %r13b, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $15, %r13
-; AVX512DQ-NEXT:    orq %r15, %r13
-; AVX512DQ-NEXT:    movq %r12, %r15
-; AVX512DQ-NEXT:    shlq $16, %r15
-; AVX512DQ-NEXT:    orq %r13, %r15
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $17, %r13
-; AVX512DQ-NEXT:    orq %r15, %r13
-; AVX512DQ-NEXT:    movq %r12, %r15
-; AVX512DQ-NEXT:    shlq $18, %r15
-; AVX512DQ-NEXT:    orq %r13, %r15
-; AVX512DQ-NEXT:    kmovw %k4, %r13d
-; AVX512DQ-NEXT:    kshiftrw $5, %k1, %k4
-; AVX512DQ-NEXT:    shlq $19, %r12
-; AVX512DQ-NEXT:    orq %r15, %r12
-; AVX512DQ-NEXT:    movzbl %r13b, %r15d
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    movq %r15, %r13
-; AVX512DQ-NEXT:    shlq $20, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $21, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    movq %r15, %r13
-; AVX512DQ-NEXT:    shlq $22, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $23, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    kmovw %k4, %r13d
-; AVX512DQ-NEXT:    kshiftrw $12, %k1, %k4
-; AVX512DQ-NEXT:    shlq $24, %r15
-; AVX512DQ-NEXT:    orq %r12, %r15
-; AVX512DQ-NEXT:    movzbl %r13b, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $25, %r13
-; AVX512DQ-NEXT:    orq %r15, %r13
-; AVX512DQ-NEXT:    movq %r12, %r15
-; AVX512DQ-NEXT:    shlq $26, %r15
-; AVX512DQ-NEXT:    orq %r13, %r15
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $27, %r13
-; AVX512DQ-NEXT:    orq %r15, %r13
-; AVX512DQ-NEXT:    movq %r12, %r15
-; AVX512DQ-NEXT:    shlq $28, %r15
-; AVX512DQ-NEXT:    orq %r13, %r15
-; AVX512DQ-NEXT:    shlq $29, %r12
-; AVX512DQ-NEXT:    orq %r15, %r12
-; AVX512DQ-NEXT:    movzbl %r10b, %r13d
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $30, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    kmovw %k4, %r15d
-; AVX512DQ-NEXT:    kshiftrw $7, %k1, %k4
-; AVX512DQ-NEXT:    shlq $31, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    andl $1, %r10d
-; AVX512DQ-NEXT:    movq %r10, %rax
-; AVX512DQ-NEXT:    shlq $32, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r10, %r13
-; AVX512DQ-NEXT:    shlq $33, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    kmovw %k4, %r12d
-; AVX512DQ-NEXT:    kshiftrw $8, %k1, %k4
-; AVX512DQ-NEXT:    shlq $34, %r10
-; AVX512DQ-NEXT:    orq %r13, %r10
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $35, %rax
-; AVX512DQ-NEXT:    orq %r10, %rax
-; AVX512DQ-NEXT:    movq %r12, %r10
-; AVX512DQ-NEXT:    shlq $36, %r10
-; AVX512DQ-NEXT:    orq %rax, %r10
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $37, %rax
-; AVX512DQ-NEXT:    orq %r10, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $38, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    kmovw %k4, %r10d
-; AVX512DQ-NEXT:    kshiftrw $9, %k1, %k4
-; AVX512DQ-NEXT:    shlq $39, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    andl $1, %r10d
-; AVX512DQ-NEXT:    movq %r10, %rax
-; AVX512DQ-NEXT:    shlq $40, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r10, %r12
-; AVX512DQ-NEXT:    shlq $41, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r10, %rax
-; AVX512DQ-NEXT:    shlq $42, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r10, %r13
-; AVX512DQ-NEXT:    shlq $43, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    kmovw %k4, %r12d
-; AVX512DQ-NEXT:    kshiftrw $10, %k1, %k4
-; AVX512DQ-NEXT:    shlq $44, %r10
-; AVX512DQ-NEXT:    orq %r13, %r10
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $45, %rax
-; AVX512DQ-NEXT:    orq %r10, %rax
-; AVX512DQ-NEXT:    movq %r12, %r10
-; AVX512DQ-NEXT:    shlq $46, %r10
-; AVX512DQ-NEXT:    orq %rax, %r10
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $47, %rax
-; AVX512DQ-NEXT:    orq %r10, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $48, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    kmovw %k4, %r10d
-; AVX512DQ-NEXT:    kshiftrw $11, %k1, %k4
-; AVX512DQ-NEXT:    shlq $49, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    andl $1, %r10d
-; AVX512DQ-NEXT:    movq %r10, %rax
-; AVX512DQ-NEXT:    shlq $50, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r10, %r12
-; AVX512DQ-NEXT:    shlq $51, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r10, %rax
-; AVX512DQ-NEXT:    shlq $52, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r10, %r13
-; AVX512DQ-NEXT:    shlq $53, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    kmovw %k4, %r12d
-; AVX512DQ-NEXT:    kshiftrw $3, %k2, %k4
-; AVX512DQ-NEXT:    shlq $54, %r10
-; AVX512DQ-NEXT:    orq %r13, %r10
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $55, %rax
-; AVX512DQ-NEXT:    orq %r10, %rax
-; AVX512DQ-NEXT:    movq %r12, %r10
-; AVX512DQ-NEXT:    shlq $56, %r10
-; AVX512DQ-NEXT:    orq %rax, %r10
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $57, %rax
-; AVX512DQ-NEXT:    orq %r10, %rax
-; AVX512DQ-NEXT:    movq %r12, %r10
-; AVX512DQ-NEXT:    shlq $58, %r10
-; AVX512DQ-NEXT:    orq %rax, %r10
-; AVX512DQ-NEXT:    shlq $59, %r12
-; AVX512DQ-NEXT:    orq %r10, %r12
-; AVX512DQ-NEXT:    movzbl %r15b, %r10d
-; AVX512DQ-NEXT:    movl %r15d, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %r15
-; AVX512DQ-NEXT:    shlq $60, %r15
-; AVX512DQ-NEXT:    orq %r12, %r15
-; AVX512DQ-NEXT:    movq %rax, %r12
-; AVX512DQ-NEXT:    shlq $61, %r12
-; AVX512DQ-NEXT:    orq %r15, %r12
-; AVX512DQ-NEXT:    kmovw %k4, %r15d
-; AVX512DQ-NEXT:    kshiftrw $5, %k2, %k4
-; AVX512DQ-NEXT:    shlq $62, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r10, %r12
-; AVX512DQ-NEXT:    shlq $63, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    orq %r11, %r12
-; AVX512DQ-NEXT:    movq %r12, (%rsi)
-; AVX512DQ-NEXT:    movzbl %r15b, %r11d
-; AVX512DQ-NEXT:    movl %r11d, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    leaq (%rax,%rax,2), %r15
-; AVX512DQ-NEXT:    leaq (%r15,%rax,4), %r15
-; AVX512DQ-NEXT:    leaq (%r15,%rax,8), %rax
-; AVX512DQ-NEXT:    movzbl %r14b, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %r15
-; AVX512DQ-NEXT:    shlq $4, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    movq %r12, %r14
-; AVX512DQ-NEXT:    shlq $5, %r14
-; AVX512DQ-NEXT:    orq %r15, %r14
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $6, %rax
-; AVX512DQ-NEXT:    movq %r12, %r15
-; AVX512DQ-NEXT:    shlq $7, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    kmovw %k4, %eax
-; AVX512DQ-NEXT:    kshiftrw $6, %k2, %k4
-; AVX512DQ-NEXT:    shlq $8, %r12
-; AVX512DQ-NEXT:    orq %r15, %r12
-; AVX512DQ-NEXT:    movzbl %al, %r15d
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $9, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $10, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $11, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $12, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    kmovw %k4, %eax
-; AVX512DQ-NEXT:    kshiftrw $7, %k2, %k4
-; AVX512DQ-NEXT:    shlq $13, %r15
-; AVX512DQ-NEXT:    orq %r12, %r15
-; AVX512DQ-NEXT:    movzbl %al, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $14, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movq %r12, %r15
-; AVX512DQ-NEXT:    shlq $15, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $16, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movq %r12, %r15
-; AVX512DQ-NEXT:    shlq $17, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    kmovw %k4, %eax
-; AVX512DQ-NEXT:    kshiftrw $8, %k2, %k4
-; AVX512DQ-NEXT:    shlq $18, %r12
-; AVX512DQ-NEXT:    orq %r15, %r12
-; AVX512DQ-NEXT:    movzbl %al, %r15d
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $19, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $20, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $21, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $22, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    kmovw %k4, %eax
-; AVX512DQ-NEXT:    kshiftrw $15, %k2, %k4
-; AVX512DQ-NEXT:    shlq $23, %r15
-; AVX512DQ-NEXT:    orq %r12, %r15
-; AVX512DQ-NEXT:    movzbl %al, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $24, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movq %r12, %r15
-; AVX512DQ-NEXT:    shlq $25, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $26, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movq %r12, %r15
-; AVX512DQ-NEXT:    shlq $27, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    shlq $28, %r12
-; AVX512DQ-NEXT:    orq %r15, %r12
-; AVX512DQ-NEXT:    movzbl %bl, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %r15
-; AVX512DQ-NEXT:    shlq $29, %r15
-; AVX512DQ-NEXT:    orq %r12, %r15
-; AVX512DQ-NEXT:    movq %rax, %r12
-; AVX512DQ-NEXT:    shlq $30, %r12
-; AVX512DQ-NEXT:    orq %r15, %r12
-; AVX512DQ-NEXT:    kmovw %k4, %r15d
-; AVX512DQ-NEXT:    kshiftrw $10, %k2, %k4
-; AVX512DQ-NEXT:    shlq $31, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    andl $1, %ebx
-; AVX512DQ-NEXT:    movq %rbx, %r13
-; AVX512DQ-NEXT:    shlq $32, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    kmovw %k4, %r12d
-; AVX512DQ-NEXT:    kshiftrw $11, %k2, %k4
-; AVX512DQ-NEXT:    shlq $33, %rbx
-; AVX512DQ-NEXT:    orq %r13, %rbx
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $34, %rax
-; AVX512DQ-NEXT:    orq %rbx, %rax
-; AVX512DQ-NEXT:    movq %r12, %rbx
-; AVX512DQ-NEXT:    shlq $35, %rbx
-; AVX512DQ-NEXT:    orq %rax, %rbx
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $36, %rax
-; AVX512DQ-NEXT:    orq %rbx, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $37, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    kmovw %k4, %ebx
-; AVX512DQ-NEXT:    kshiftrw $12, %k2, %k4
-; AVX512DQ-NEXT:    shlq $38, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    andl $1, %ebx
-; AVX512DQ-NEXT:    movq %rbx, %rax
-; AVX512DQ-NEXT:    shlq $39, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %rbx, %r12
-; AVX512DQ-NEXT:    shlq $40, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %rbx, %rax
-; AVX512DQ-NEXT:    shlq $41, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %rbx, %r13
-; AVX512DQ-NEXT:    shlq $42, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    kmovw %k4, %r12d
-; AVX512DQ-NEXT:    kshiftrw $13, %k2, %k4
-; AVX512DQ-NEXT:    shlq $43, %rbx
-; AVX512DQ-NEXT:    orq %r13, %rbx
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $44, %rax
-; AVX512DQ-NEXT:    orq %rbx, %rax
-; AVX512DQ-NEXT:    movq %r12, %rbx
-; AVX512DQ-NEXT:    shlq $45, %rbx
-; AVX512DQ-NEXT:    orq %rax, %rbx
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $46, %rax
-; AVX512DQ-NEXT:    orq %rbx, %rax
-; AVX512DQ-NEXT:    movq %r12, %rbx
-; AVX512DQ-NEXT:    shlq $47, %rbx
-; AVX512DQ-NEXT:    orq %rax, %rbx
-; AVX512DQ-NEXT:    kmovw %k4, %r13d
-; AVX512DQ-NEXT:    kshiftrw $14, %k2, %k4
-; AVX512DQ-NEXT:    shlq $48, %r12
-; AVX512DQ-NEXT:    orq %rbx, %r12
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $49, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %rbx
-; AVX512DQ-NEXT:    shlq $50, %rbx
-; AVX512DQ-NEXT:    orq %rax, %rbx
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $51, %rax
-; AVX512DQ-NEXT:    orq %rbx, %rax
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $52, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    kmovw %k4, %ebx
-; AVX512DQ-NEXT:    kshiftrw $9, %k0, %k4
-; AVX512DQ-NEXT:    shlq $53, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    andl $1, %ebx
-; AVX512DQ-NEXT:    movq %rbx, %rax
-; AVX512DQ-NEXT:    shlq $54, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %rbx, %r12
-; AVX512DQ-NEXT:    shlq $55, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %rbx, %rax
-; AVX512DQ-NEXT:    shlq $56, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %rbx, %r12
-; AVX512DQ-NEXT:    shlq $57, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    shlq $58, %rbx
-; AVX512DQ-NEXT:    orq %r12, %rbx
-; AVX512DQ-NEXT:    movzbl %r15b, %eax
-; AVX512DQ-NEXT:    # kill: def $r15d killed $r15d def $r15
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $59, %r12
-; AVX512DQ-NEXT:    orq %rbx, %r12
-; AVX512DQ-NEXT:    movq %r15, %rbx
-; AVX512DQ-NEXT:    shlq $60, %rbx
-; AVX512DQ-NEXT:    orq %r12, %rbx
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $61, %r12
-; AVX512DQ-NEXT:    orq %rbx, %r12
-; AVX512DQ-NEXT:    kmovw %k4, %ebx
-; AVX512DQ-NEXT:    kshiftrw $10, %k0, %k4
-; AVX512DQ-NEXT:    shlq $62, %r15
-; AVX512DQ-NEXT:    orq %r12, %r15
-; AVX512DQ-NEXT:    kmovw %k4, %r12d
-; AVX512DQ-NEXT:    kshiftrw $12, %k0, %k4
-; AVX512DQ-NEXT:    shlq $63, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    orq %r14, %rax
-; AVX512DQ-NEXT:    movq %rax, 32(%rsi)
-; AVX512DQ-NEXT:    movzbl %r12b, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movzbl %bl, %r14d
-; AVX512DQ-NEXT:    movl %r14d, %r15d
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    leaq (%r15,%r15,2), %r15
-; AVX512DQ-NEXT:    leaq (%r15,%rax,4), %r15
-; AVX512DQ-NEXT:    leaq (%r15,%rax,8), %r15
-; AVX512DQ-NEXT:    movq %rax, %r12
-; AVX512DQ-NEXT:    shlq $4, %r12
-; AVX512DQ-NEXT:    orq %r15, %r12
-; AVX512DQ-NEXT:    movq %rax, %r15
-; AVX512DQ-NEXT:    shlq $5, %r15
-; AVX512DQ-NEXT:    orq %r12, %r15
-; AVX512DQ-NEXT:    shlq $6, %rax
-; AVX512DQ-NEXT:    movzbl %bpl, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $7, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $8, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $9, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $10, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    kmovw %k4, %ebp
-; AVX512DQ-NEXT:    kshiftrw $13, %k0, %k4
-; AVX512DQ-NEXT:    shlq $11, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movzbl %bpl, %r13d
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $12, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $13, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $14, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $15, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    kmovw %k4, %eax
-; AVX512DQ-NEXT:    kshiftrw $14, %k0, %k4
-; AVX512DQ-NEXT:    shlq $16, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    movzbl %al, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $17, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $18, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $19, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $20, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    kmovw %k4, %eax
-; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k4
-; AVX512DQ-NEXT:    shlq $21, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    movzbl %al, %r13d
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $22, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $23, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $24, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $25, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    kmovw %k4, %eax
-; AVX512DQ-NEXT:    kshiftrw $1, %k3, %k4
-; AVX512DQ-NEXT:    shlq $26, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    movzbl %al, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $27, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $28, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $29, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $30, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    shlq $31, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    kmovw %k3, %r13d
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $32, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $33, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $34, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %rbp
-; AVX512DQ-NEXT:    shlq $35, %rbp
-; AVX512DQ-NEXT:    orq %rax, %rbp
-; AVX512DQ-NEXT:    kmovw %k4, %r12d
-; AVX512DQ-NEXT:    kshiftrw $2, %k3, %k4
-; AVX512DQ-NEXT:    shlq $36, %r13
-; AVX512DQ-NEXT:    orq %rbp, %r13
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $37, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $38, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $39, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %rbp
-; AVX512DQ-NEXT:    shlq $40, %rbp
-; AVX512DQ-NEXT:    orq %rax, %rbp
-; AVX512DQ-NEXT:    kmovw %k4, %r13d
-; AVX512DQ-NEXT:    kshiftrw $3, %k3, %k4
-; AVX512DQ-NEXT:    shlq $41, %r12
-; AVX512DQ-NEXT:    orq %rbp, %r12
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $42, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $43, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $44, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %rbp
-; AVX512DQ-NEXT:    shlq $45, %rbp
-; AVX512DQ-NEXT:    orq %rax, %rbp
-; AVX512DQ-NEXT:    kmovw %k4, %r12d
-; AVX512DQ-NEXT:    kshiftrw $4, %k3, %k4
-; AVX512DQ-NEXT:    shlq $46, %r13
-; AVX512DQ-NEXT:    orq %rbp, %r13
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $47, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $48, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $49, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %rbp
-; AVX512DQ-NEXT:    shlq $50, %rbp
-; AVX512DQ-NEXT:    orq %rax, %rbp
-; AVX512DQ-NEXT:    kmovw %k4, %r13d
-; AVX512DQ-NEXT:    kshiftrw $5, %k3, %k4
-; AVX512DQ-NEXT:    shlq $51, %r12
-; AVX512DQ-NEXT:    orq %rbp, %r12
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $52, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $53, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $54, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %rbp
-; AVX512DQ-NEXT:    shlq $55, %rbp
-; AVX512DQ-NEXT:    orq %rax, %rbp
-; AVX512DQ-NEXT:    kmovw %k4, %r12d
-; AVX512DQ-NEXT:    kshiftrw $6, %k3, %k4
-; AVX512DQ-NEXT:    shlq $56, %r13
-; AVX512DQ-NEXT:    orq %rbp, %r13
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $57, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $58, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $59, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $60, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    kmovw %k4, %eax
-; AVX512DQ-NEXT:    kshiftrw $8, %k3, %k4
-; AVX512DQ-NEXT:    shlq $61, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    movzbl %al, %r13d
-; AVX512DQ-NEXT:    # kill: def $eax killed $eax def $rax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    shlq $62, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $63, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    kmovw %k4, %eax
-; AVX512DQ-NEXT:    kshiftrw $7, %k3, %k4
-; AVX512DQ-NEXT:    orq %r15, %r12
-; AVX512DQ-NEXT:    kmovw %k4, %ebp
-; AVX512DQ-NEXT:    kshiftrw $9, %k3, %k4
-; AVX512DQ-NEXT:    movq %r12, 16(%rsi)
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    leaq (%r13,%r13,2), %r15
-; AVX512DQ-NEXT:    leaq (%r15,%r13,4), %r15
-; AVX512DQ-NEXT:    movzbl %bpl, %r13d
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    leaq (%r15,%r13,8), %r15
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $4, %r12
-; AVX512DQ-NEXT:    orq %r15, %r12
-; AVX512DQ-NEXT:    movq %r13, %r15
-; AVX512DQ-NEXT:    shlq $5, %r15
-; AVX512DQ-NEXT:    orq %r12, %r15
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $6, %r12
-; AVX512DQ-NEXT:    shlq $7, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    movzbl %al, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $8, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $9, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $10, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $11, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    kmovw %k4, %eax
-; AVX512DQ-NEXT:    kshiftrw $10, %k3, %k4
-; AVX512DQ-NEXT:    shlq $12, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    movzbl %al, %r13d
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $13, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $14, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $15, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $16, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    kmovw %k4, %eax
-; AVX512DQ-NEXT:    kshiftrw $11, %k3, %k4
-; AVX512DQ-NEXT:    shlq $17, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    movzbl %al, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $18, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $19, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $20, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $21, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    kmovw %k4, %eax
-; AVX512DQ-NEXT:    kshiftrw $12, %k3, %k4
-; AVX512DQ-NEXT:    shlq $22, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    movzbl %al, %r13d
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $23, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $24, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $25, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $26, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    kmovw %k4, %eax
-; AVX512DQ-NEXT:    kshiftrw $13, %k3, %k4
-; AVX512DQ-NEXT:    shlq $27, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    movzbl %al, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $28, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $29, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $30, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %rbp
-; AVX512DQ-NEXT:    shlq $31, %rbp
-; AVX512DQ-NEXT:    orq %rax, %rbp
-; AVX512DQ-NEXT:    kmovw %k4, %r13d
-; AVX512DQ-NEXT:    kshiftrw $14, %k3, %k4
-; AVX512DQ-NEXT:    shlq $32, %r12
-; AVX512DQ-NEXT:    orq %rbp, %r12
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $33, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $34, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $35, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %rbp
-; AVX512DQ-NEXT:    shlq $36, %rbp
-; AVX512DQ-NEXT:    orq %rax, %rbp
-; AVX512DQ-NEXT:    kmovw %k4, %r12d
-; AVX512DQ-NEXT:    kshiftrw $15, %k3, %k3
-; AVX512DQ-NEXT:    shlq $37, %r13
-; AVX512DQ-NEXT:    orq %rbp, %r13
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $38, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $39, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $40, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $41, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    kmovw %k3, %eax
-; AVX512DQ-NEXT:    kshiftrw $1, %k2, %k3
-; AVX512DQ-NEXT:    shlq $42, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    movzbl %al, %r13d
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $43, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $44, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $45, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $46, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    shlq $47, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    kmovw %k2, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $48, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $49, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $50, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %rbp
-; AVX512DQ-NEXT:    shlq $51, %rbp
-; AVX512DQ-NEXT:    orq %rax, %rbp
-; AVX512DQ-NEXT:    kmovw %k3, %r13d
-; AVX512DQ-NEXT:    kshiftrw $2, %k2, %k2
-; AVX512DQ-NEXT:    shlq $52, %r12
-; AVX512DQ-NEXT:    orq %rbp, %r12
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $53, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $54, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $55, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %rbp
-; AVX512DQ-NEXT:    shlq $56, %rbp
-; AVX512DQ-NEXT:    orq %rax, %rbp
-; AVX512DQ-NEXT:    kmovw %k2, %r12d
-; AVX512DQ-NEXT:    kshiftrw $13, %k1, %k1
-; AVX512DQ-NEXT:    shlq $57, %r13
-; AVX512DQ-NEXT:    orq %rbp, %r13
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $58, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $59, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $60, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $61, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    kmovw %k1, %ebp
-; AVX512DQ-NEXT:    kshiftrw $4, %k0, %k1
-; AVX512DQ-NEXT:    shlq $62, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    kmovw %k1, %r13d
-; AVX512DQ-NEXT:    kshiftrw $5, %k0, %k1
-; AVX512DQ-NEXT:    shlq $63, %r11
-; AVX512DQ-NEXT:    orq %r12, %r11
-; AVX512DQ-NEXT:    kmovw %k1, %r12d
-; AVX512DQ-NEXT:    kshiftrw $6, %k0, %k1
-; AVX512DQ-NEXT:    orq %r15, %r11
-; AVX512DQ-NEXT:    kmovw %k1, %r15d
-; AVX512DQ-NEXT:    movq %r11, 24(%rsi)
-; AVX512DQ-NEXT:    movzbl %bpl, %r11d
-; AVX512DQ-NEXT:    andl $1, %r11d
-; AVX512DQ-NEXT:    andl $1, %r10d
-; AVX512DQ-NEXT:    leaq (%r10,%r11,2), %rax
-; AVX512DQ-NEXT:    leaq (%rax,%r11,4), %rax
-; AVX512DQ-NEXT:    leaq (%rax,%r11,8), %rax
-; AVX512DQ-NEXT:    movq %r11, %r10
-; AVX512DQ-NEXT:    shlq $4, %r10
-; AVX512DQ-NEXT:    orq %rax, %r10
-; AVX512DQ-NEXT:    shlq $5, %r11
-; AVX512DQ-NEXT:    orq %r10, %r11
-; AVX512DQ-NEXT:    movzbl %r9b, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %r9
-; AVX512DQ-NEXT:    shlq $6, %r9
-; AVX512DQ-NEXT:    movq %rax, %r10
-; AVX512DQ-NEXT:    shlq $7, %r10
-; AVX512DQ-NEXT:    orq %r9, %r10
-; AVX512DQ-NEXT:    movq %rax, %r9
-; AVX512DQ-NEXT:    shlq $8, %r9
-; AVX512DQ-NEXT:    orq %r10, %r9
-; AVX512DQ-NEXT:    movq %rax, %rbp
-; AVX512DQ-NEXT:    shlq $9, %rbp
-; AVX512DQ-NEXT:    orq %r9, %rbp
-; AVX512DQ-NEXT:    kmovw %k0, %r10d
-; AVX512DQ-NEXT:    kshiftrw $8, %k0, %k0
-; AVX512DQ-NEXT:    shlq $10, %rax
-; AVX512DQ-NEXT:    orq %rbp, %rax
-; AVX512DQ-NEXT:    movzbl %r8b, %r9d
-; AVX512DQ-NEXT:    andl $1, %r9d
-; AVX512DQ-NEXT:    movq %r9, %r8
-; AVX512DQ-NEXT:    shlq $11, %r8
-; AVX512DQ-NEXT:    orq %rax, %r8
-; AVX512DQ-NEXT:    movq %r9, %rax
-; AVX512DQ-NEXT:    shlq $12, %rax
-; AVX512DQ-NEXT:    orq %r8, %rax
-; AVX512DQ-NEXT:    movq %r9, %r8
-; AVX512DQ-NEXT:    shlq $13, %r8
-; AVX512DQ-NEXT:    orq %rax, %r8
-; AVX512DQ-NEXT:    movq %r9, %rax
-; AVX512DQ-NEXT:    shlq $14, %rax
-; AVX512DQ-NEXT:    orq %r8, %rax
-; AVX512DQ-NEXT:    kmovw %k0, %r8d
-; AVX512DQ-NEXT:    shlq $15, %r9
-; AVX512DQ-NEXT:    orq %rax, %r9
-; AVX512DQ-NEXT:    movzbl %r10b, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %r10
-; AVX512DQ-NEXT:    shlq $16, %r10
-; AVX512DQ-NEXT:    orq %r9, %r10
-; AVX512DQ-NEXT:    movq %rax, %r9
-; AVX512DQ-NEXT:    shlq $17, %r9
-; AVX512DQ-NEXT:    orq %r10, %r9
-; AVX512DQ-NEXT:    movq %rax, %r10
-; AVX512DQ-NEXT:    shlq $18, %r10
-; AVX512DQ-NEXT:    orq %r9, %r10
-; AVX512DQ-NEXT:    movq %rax, %r9
-; AVX512DQ-NEXT:    shlq $19, %r9
-; AVX512DQ-NEXT:    orq %r10, %r9
-; AVX512DQ-NEXT:    shlq $20, %rax
-; AVX512DQ-NEXT:    orq %r9, %rax
-; AVX512DQ-NEXT:    movzbl %dil, %edi
-; AVX512DQ-NEXT:    andl $1, %edi
-; AVX512DQ-NEXT:    movq %rdi, %r9
-; AVX512DQ-NEXT:    shlq $21, %r9
-; AVX512DQ-NEXT:    orq %rax, %r9
-; AVX512DQ-NEXT:    movq %rdi, %rax
-; AVX512DQ-NEXT:    shlq $22, %rax
-; AVX512DQ-NEXT:    orq %r9, %rax
-; AVX512DQ-NEXT:    movq %rdi, %r9
-; AVX512DQ-NEXT:    shlq $23, %r9
-; AVX512DQ-NEXT:    orq %rax, %r9
-; AVX512DQ-NEXT:    movq %rdi, %rax
-; AVX512DQ-NEXT:    shlq $24, %rax
-; AVX512DQ-NEXT:    orq %r9, %rax
-; AVX512DQ-NEXT:    shlq $25, %rdi
-; AVX512DQ-NEXT:    orq %rax, %rdi
-; AVX512DQ-NEXT:    movzbl %dl, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %rdx
-; AVX512DQ-NEXT:    shlq $26, %rdx
-; AVX512DQ-NEXT:    orq %rdi, %rdx
-; AVX512DQ-NEXT:    movq %rax, %rdi
-; AVX512DQ-NEXT:    shlq $27, %rdi
-; AVX512DQ-NEXT:    orq %rdx, %rdi
-; AVX512DQ-NEXT:    movq %rax, %rdx
-; AVX512DQ-NEXT:    shlq $28, %rdx
-; AVX512DQ-NEXT:    orq %rdi, %rdx
-; AVX512DQ-NEXT:    movq %rax, %rdi
-; AVX512DQ-NEXT:    shlq $29, %rdi
-; AVX512DQ-NEXT:    orq %rdx, %rdi
-; AVX512DQ-NEXT:    shlq $30, %rax
-; AVX512DQ-NEXT:    orq %rdi, %rax
-; AVX512DQ-NEXT:    movzbl %cl, %ecx
-; AVX512DQ-NEXT:    andl $1, %ecx
-; AVX512DQ-NEXT:    movq %rcx, %rdx
-; AVX512DQ-NEXT:    shlq $31, %rdx
-; AVX512DQ-NEXT:    orq %rax, %rdx
-; AVX512DQ-NEXT:    movq %rcx, %rax
-; AVX512DQ-NEXT:    shlq $32, %rax
-; AVX512DQ-NEXT:    orq %rdx, %rax
-; AVX512DQ-NEXT:    movq %rcx, %rdx
-; AVX512DQ-NEXT:    shlq $33, %rdx
-; AVX512DQ-NEXT:    orq %rax, %rdx
-; AVX512DQ-NEXT:    movq %rcx, %rax
-; AVX512DQ-NEXT:    shlq $34, %rax
-; AVX512DQ-NEXT:    orq %rdx, %rax
-; AVX512DQ-NEXT:    shlq $35, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $36, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    movq %r13, %rcx
-; AVX512DQ-NEXT:    shlq $37, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $38, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    movq %r13, %rcx
-; AVX512DQ-NEXT:    shlq $39, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    shlq $40, %r13
-; AVX512DQ-NEXT:    orq %rcx, %r13
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $41, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %rcx
-; AVX512DQ-NEXT:    shlq $42, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $43, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    movq %r12, %rcx
-; AVX512DQ-NEXT:    shlq $44, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    shlq $45, %r12
-; AVX512DQ-NEXT:    orq %rcx, %r12
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $46, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r15, %rcx
-; AVX512DQ-NEXT:    shlq $47, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $48, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    movq %r15, %rcx
-; AVX512DQ-NEXT:    shlq $49, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    shlq $50, %r15
-; AVX512DQ-NEXT:    orq %rcx, %r15
-; AVX512DQ-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %rcx
-; AVX512DQ-NEXT:    shlq $51, %rcx
-; AVX512DQ-NEXT:    orq %r15, %rcx
-; AVX512DQ-NEXT:    movq %rax, %rdx
-; AVX512DQ-NEXT:    shlq $52, %rdx
-; AVX512DQ-NEXT:    orq %rcx, %rdx
-; AVX512DQ-NEXT:    movq %rax, %rcx
-; AVX512DQ-NEXT:    shlq $53, %rcx
-; AVX512DQ-NEXT:    orq %rdx, %rcx
-; AVX512DQ-NEXT:    movq %rax, %rdx
-; AVX512DQ-NEXT:    shlq $54, %rdx
-; AVX512DQ-NEXT:    orq %rcx, %rdx
-; AVX512DQ-NEXT:    shlq $55, %rax
-; AVX512DQ-NEXT:    orq %rdx, %rax
-; AVX512DQ-NEXT:    andl $1, %r8d
-; AVX512DQ-NEXT:    movq %r8, %rcx
-; AVX512DQ-NEXT:    shlq $56, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    movq %r8, %rax
-; AVX512DQ-NEXT:    shlq $57, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    movq %r8, %rcx
-; AVX512DQ-NEXT:    shlq $58, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    movq %r8, %rax
-; AVX512DQ-NEXT:    shlq $59, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    shlq $60, %r8
-; AVX512DQ-NEXT:    orq %rax, %r8
-; AVX512DQ-NEXT:    andl $1, %ebx
-; AVX512DQ-NEXT:    movq %rbx, %rax
-; AVX512DQ-NEXT:    shlq $61, %rax
-; AVX512DQ-NEXT:    orq %r8, %rax
-; AVX512DQ-NEXT:    shlq $62, %rbx
-; AVX512DQ-NEXT:    orq %rax, %rbx
-; AVX512DQ-NEXT:    shlq $63, %r14
-; AVX512DQ-NEXT:    orq %rbx, %r14
-; AVX512DQ-NEXT:    orq %r11, %r14
-; AVX512DQ-NEXT:    movq %r14, 8(%rsi)
-; AVX512DQ-NEXT:    popq %rbx
-; AVX512DQ-NEXT:    popq %r12
-; AVX512DQ-NEXT:    popq %r13
-; AVX512DQ-NEXT:    popq %r14
-; AVX512DQ-NEXT:    popq %r15
-; AVX512DQ-NEXT:    popq %rbp
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm5
+; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k1
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15]
+; AVX512DQ-NEXT:    vpermd %zmm3, %zmm2, %zmm6
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12]
+; AVX512DQ-NEXT:    vpermd %zmm3, %zmm7, %zmm8
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9]
+; AVX512DQ-NEXT:    vpermd %zmm3, %zmm9, %zmm10
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6]
+; AVX512DQ-NEXT:    vpermd %zmm3, %zmm11, %zmm12
+; AVX512DQ-NEXT:    vpermd %zmm3, %zmm1, %zmm3
+; AVX512DQ-NEXT:    vpermd %zmm4, %zmm2, %zmm13
+; AVX512DQ-NEXT:    vpermd %zmm4, %zmm7, %zmm14
+; AVX512DQ-NEXT:    vpermd %zmm4, %zmm9, %zmm15
+; AVX512DQ-NEXT:    vpermd %zmm4, %zmm11, %zmm16
+; AVX512DQ-NEXT:    vpermd %zmm4, %zmm1, %zmm4
+; AVX512DQ-NEXT:    vpermd %zmm5, %zmm2, %zmm17
+; AVX512DQ-NEXT:    vpermd %zmm5, %zmm7, %zmm18
+; AVX512DQ-NEXT:    vpermd %zmm5, %zmm9, %zmm19
+; AVX512DQ-NEXT:    vpermd %zmm5, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpermd %zmm5, %zmm11, %zmm5
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm2, %zmm2
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm7, %zmm7
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm9, %zmm9
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm11, %zmm0
+; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm11 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k1
+; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm9, %k1
+; AVX512DQ-NEXT:    vmovdqa32 128(%rsi), %zmm9 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm7, %k1
+; AVX512DQ-NEXT:    vmovdqa32 192(%rsi), %zmm7 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k1
+; AVX512DQ-NEXT:    vmovdqa32 256(%rsi), %zmm2 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k1
+; AVX512DQ-NEXT:    vmovdqa32 320(%rsi), %zmm1 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm5, %k1
+; AVX512DQ-NEXT:    vmovdqa32 384(%rsi), %zmm5 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm19, %k1
+; AVX512DQ-NEXT:    vmovdqa32 448(%rsi), %zmm19 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm18, %k1
+; AVX512DQ-NEXT:    vmovdqa32 512(%rsi), %zmm18 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm17, %k1
+; AVX512DQ-NEXT:    vmovdqa32 576(%rsi), %zmm17 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm4, %k1
+; AVX512DQ-NEXT:    vmovdqa32 640(%rsi), %zmm4 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm16, %k1
+; AVX512DQ-NEXT:    vmovdqa32 704(%rsi), %zmm16 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm15, %k1
+; AVX512DQ-NEXT:    vmovdqa32 768(%rsi), %zmm15 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm14, %k1
+; AVX512DQ-NEXT:    vmovdqa32 832(%rsi), %zmm14 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm13, %k1
+; AVX512DQ-NEXT:    vmovdqa32 896(%rsi), %zmm13 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm3, %k1
+; AVX512DQ-NEXT:    vmovdqa32 960(%rsi), %zmm3 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm12, %k1
+; AVX512DQ-NEXT:    vmovdqa32 1024(%rsi), %zmm12 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm10, %k1
+; AVX512DQ-NEXT:    vmovdqa32 1088(%rsi), %zmm10 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm8, %k1
+; AVX512DQ-NEXT:    vmovdqa32 1152(%rsi), %zmm8 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm6, %k1
+; AVX512DQ-NEXT:    vmovdqa32 1216(%rsi), %zmm6 {%k1} {z}
+; AVX512DQ-NEXT:    vmovdqa64 %zmm6, 1216(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm8, 1152(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm10, 1088(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm12, 1024(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 960(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm13, 896(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm14, 832(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm15, 768(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm16, 704(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 640(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm17, 576(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm18, 512(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm19, 448(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm5, 384(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 320(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 256(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm7, 192(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm9, 128(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm11, (%rdx)
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512BW-LABEL: mask_replication_factor5_vf64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    pushq %rbp
-; AVX512BW-NEXT:    pushq %r15
-; AVX512BW-NEXT:    pushq %r14
-; AVX512BW-NEXT:    pushq %r13
-; AVX512BW-NEXT:    pushq %r12
-; AVX512BW-NEXT:    pushq %rbx
-; AVX512BW-NEXT:    kmovq (%rdi), %k0
-; AVX512BW-NEXT:    kshiftrq $19, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512BW-NEXT:    kshiftrq $18, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %ecx
-; AVX512BW-NEXT:    kshiftrq $17, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %edx
-; AVX512BW-NEXT:    kshiftrq $16, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %edi
-; AVX512BW-NEXT:    kshiftrq $15, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r8d
-; AVX512BW-NEXT:    kshiftrq $14, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r9d
-; AVX512BW-NEXT:    kshiftrq $27, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %ebp
-; AVX512BW-NEXT:    kshiftrq $57, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %ebx
-; AVX512BW-NEXT:    kshiftrq $52, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r14d
-; AVX512BW-NEXT:    kshiftrq $6, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r10d
-; AVX512BW-NEXT:    kshiftrq $1, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r11d
-; AVX512BW-NEXT:    kshiftrq $2, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k0, %r15d
-; AVX512BW-NEXT:    movzbl %r15b, %r15d
-; AVX512BW-NEXT:    andl $1, %r15d
-; AVX512BW-NEXT:    leaq (%r15,%r15,2), %r12
-; AVX512BW-NEXT:    leaq (%r12,%r15,4), %r12
-; AVX512BW-NEXT:    leaq (%r12,%r15,8), %r12
-; AVX512BW-NEXT:    shlq $4, %r15
-; AVX512BW-NEXT:    orq %r12, %r15
-; AVX512BW-NEXT:    movzbl %r11b, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %r11
-; AVX512BW-NEXT:    shlq $5, %r11
-; AVX512BW-NEXT:    orq %r15, %r11
-; AVX512BW-NEXT:    movq %r12, %r15
-; AVX512BW-NEXT:    shlq $6, %r15
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $7, %r13
-; AVX512BW-NEXT:    orq %r15, %r13
-; AVX512BW-NEXT:    movq %r12, %r15
-; AVX512BW-NEXT:    shlq $8, %r15
-; AVX512BW-NEXT:    orq %r13, %r15
-; AVX512BW-NEXT:    kmovd %k1, %r13d
-; AVX512BW-NEXT:    kshiftrq $3, %k0, %k1
-; AVX512BW-NEXT:    shlq $9, %r12
-; AVX512BW-NEXT:    orq %r15, %r12
-; AVX512BW-NEXT:    movzbl %r13b, %r15d
-; AVX512BW-NEXT:    andl $1, %r15d
-; AVX512BW-NEXT:    movq %r15, %r13
-; AVX512BW-NEXT:    shlq $10, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    movq %r15, %r12
-; AVX512BW-NEXT:    shlq $11, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    movq %r15, %r13
-; AVX512BW-NEXT:    shlq $12, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    movq %r15, %r12
-; AVX512BW-NEXT:    shlq $13, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    kmovd %k1, %r13d
-; AVX512BW-NEXT:    kshiftrq $4, %k0, %k1
-; AVX512BW-NEXT:    shlq $14, %r15
-; AVX512BW-NEXT:    orq %r12, %r15
-; AVX512BW-NEXT:    movzbl %r13b, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $15, %r13
-; AVX512BW-NEXT:    orq %r15, %r13
-; AVX512BW-NEXT:    movq %r12, %r15
-; AVX512BW-NEXT:    shlq $16, %r15
-; AVX512BW-NEXT:    orq %r13, %r15
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $17, %r13
-; AVX512BW-NEXT:    orq %r15, %r13
-; AVX512BW-NEXT:    movq %r12, %r15
-; AVX512BW-NEXT:    shlq $18, %r15
-; AVX512BW-NEXT:    orq %r13, %r15
-; AVX512BW-NEXT:    kmovd %k1, %r13d
-; AVX512BW-NEXT:    kshiftrq $5, %k0, %k1
-; AVX512BW-NEXT:    shlq $19, %r12
-; AVX512BW-NEXT:    orq %r15, %r12
-; AVX512BW-NEXT:    movzbl %r13b, %r15d
-; AVX512BW-NEXT:    andl $1, %r15d
-; AVX512BW-NEXT:    movq %r15, %r13
-; AVX512BW-NEXT:    shlq $20, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    movq %r15, %r12
-; AVX512BW-NEXT:    shlq $21, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    movq %r15, %r13
-; AVX512BW-NEXT:    shlq $22, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    movq %r15, %r12
-; AVX512BW-NEXT:    shlq $23, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    kmovd %k1, %r13d
-; AVX512BW-NEXT:    kshiftrq $12, %k0, %k1
-; AVX512BW-NEXT:    shlq $24, %r15
-; AVX512BW-NEXT:    orq %r12, %r15
-; AVX512BW-NEXT:    movzbl %r13b, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $25, %r13
-; AVX512BW-NEXT:    orq %r15, %r13
-; AVX512BW-NEXT:    movq %r12, %r15
-; AVX512BW-NEXT:    shlq $26, %r15
-; AVX512BW-NEXT:    orq %r13, %r15
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $27, %r13
-; AVX512BW-NEXT:    orq %r15, %r13
-; AVX512BW-NEXT:    movq %r12, %r15
-; AVX512BW-NEXT:    shlq $28, %r15
-; AVX512BW-NEXT:    orq %r13, %r15
-; AVX512BW-NEXT:    shlq $29, %r12
-; AVX512BW-NEXT:    orq %r15, %r12
-; AVX512BW-NEXT:    movzbl %r10b, %r13d
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $30, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    kmovd %k1, %r15d
-; AVX512BW-NEXT:    kshiftrq $7, %k0, %k1
-; AVX512BW-NEXT:    shlq $31, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    andl $1, %r10d
-; AVX512BW-NEXT:    movq %r10, %rax
-; AVX512BW-NEXT:    shlq $32, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r10, %r13
-; AVX512BW-NEXT:    shlq $33, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrq $8, %k0, %k1
-; AVX512BW-NEXT:    shlq $34, %r10
-; AVX512BW-NEXT:    orq %r13, %r10
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $35, %rax
-; AVX512BW-NEXT:    orq %r10, %rax
-; AVX512BW-NEXT:    movq %r12, %r10
-; AVX512BW-NEXT:    shlq $36, %r10
-; AVX512BW-NEXT:    orq %rax, %r10
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $37, %rax
-; AVX512BW-NEXT:    orq %r10, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $38, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    kmovd %k1, %r10d
-; AVX512BW-NEXT:    kshiftrq $9, %k0, %k1
-; AVX512BW-NEXT:    shlq $39, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    andl $1, %r10d
-; AVX512BW-NEXT:    movq %r10, %rax
-; AVX512BW-NEXT:    shlq $40, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r10, %r12
-; AVX512BW-NEXT:    shlq $41, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r10, %rax
-; AVX512BW-NEXT:    shlq $42, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r10, %r13
-; AVX512BW-NEXT:    shlq $43, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrq $10, %k0, %k1
-; AVX512BW-NEXT:    shlq $44, %r10
-; AVX512BW-NEXT:    orq %r13, %r10
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $45, %rax
-; AVX512BW-NEXT:    orq %r10, %rax
-; AVX512BW-NEXT:    movq %r12, %r10
-; AVX512BW-NEXT:    shlq $46, %r10
-; AVX512BW-NEXT:    orq %rax, %r10
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $47, %rax
-; AVX512BW-NEXT:    orq %r10, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $48, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    kmovd %k1, %r10d
-; AVX512BW-NEXT:    kshiftrq $11, %k0, %k1
-; AVX512BW-NEXT:    shlq $49, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    andl $1, %r10d
-; AVX512BW-NEXT:    movq %r10, %rax
-; AVX512BW-NEXT:    shlq $50, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r10, %r12
-; AVX512BW-NEXT:    shlq $51, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r10, %rax
-; AVX512BW-NEXT:    shlq $52, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r10, %r12
-; AVX512BW-NEXT:    shlq $53, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    kshiftrq $51, %k0, %k1
-; AVX512BW-NEXT:    shlq $54, %r10
-; AVX512BW-NEXT:    orq %r12, %r10
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %r12
-; AVX512BW-NEXT:    shlq $55, %r12
-; AVX512BW-NEXT:    orq %r10, %r12
-; AVX512BW-NEXT:    movq %rax, %r10
-; AVX512BW-NEXT:    shlq $56, %r10
-; AVX512BW-NEXT:    orq %r12, %r10
-; AVX512BW-NEXT:    movq %rax, %r12
-; AVX512BW-NEXT:    shlq $57, %r12
-; AVX512BW-NEXT:    orq %r10, %r12
-; AVX512BW-NEXT:    movq %rax, %r10
-; AVX512BW-NEXT:    shlq $58, %r10
-; AVX512BW-NEXT:    orq %r12, %r10
-; AVX512BW-NEXT:    shlq $59, %rax
-; AVX512BW-NEXT:    orq %r10, %rax
-; AVX512BW-NEXT:    movl %r15d, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %r10
-; AVX512BW-NEXT:    shlq $60, %r10
-; AVX512BW-NEXT:    orq %rax, %r10
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $61, %rax
-; AVX512BW-NEXT:    orq %r10, %rax
-; AVX512BW-NEXT:    kmovd %k1, %r13d
-; AVX512BW-NEXT:    kshiftrq $53, %k0, %k1
-; AVX512BW-NEXT:    movzbl %r15b, %r10d
-; AVX512BW-NEXT:    shlq $62, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r10, %rax
-; AVX512BW-NEXT:    shlq $63, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    orq %r11, %rax
-; AVX512BW-NEXT:    movq %rax, (%rsi)
-; AVX512BW-NEXT:    movzbl %r13b, %r11d
-; AVX512BW-NEXT:    movl %r11d, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    leaq (%rax,%rax,2), %r15
-; AVX512BW-NEXT:    leaq (%r15,%rax,4), %r15
-; AVX512BW-NEXT:    leaq (%r15,%rax,8), %rax
-; AVX512BW-NEXT:    movzbl %r14b, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %r15
-; AVX512BW-NEXT:    shlq $4, %r15
-; AVX512BW-NEXT:    orq %rax, %r15
-; AVX512BW-NEXT:    movq %r12, %r14
-; AVX512BW-NEXT:    shlq $5, %r14
-; AVX512BW-NEXT:    orq %r15, %r14
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $6, %rax
-; AVX512BW-NEXT:    movq %r12, %r15
-; AVX512BW-NEXT:    shlq $7, %r15
-; AVX512BW-NEXT:    orq %rax, %r15
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    kshiftrq $54, %k0, %k1
-; AVX512BW-NEXT:    shlq $8, %r12
-; AVX512BW-NEXT:    orq %r15, %r12
-; AVX512BW-NEXT:    movzbl %al, %r15d
-; AVX512BW-NEXT:    andl $1, %r15d
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $9, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r15, %r12
-; AVX512BW-NEXT:    shlq $10, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $11, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r15, %r12
-; AVX512BW-NEXT:    shlq $12, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    kshiftrq $55, %k0, %k1
-; AVX512BW-NEXT:    shlq $13, %r15
-; AVX512BW-NEXT:    orq %r12, %r15
-; AVX512BW-NEXT:    movzbl %al, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $14, %rax
-; AVX512BW-NEXT:    orq %r15, %rax
-; AVX512BW-NEXT:    movq %r12, %r15
-; AVX512BW-NEXT:    shlq $15, %r15
-; AVX512BW-NEXT:    orq %rax, %r15
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $16, %rax
-; AVX512BW-NEXT:    orq %r15, %rax
-; AVX512BW-NEXT:    movq %r12, %r15
-; AVX512BW-NEXT:    shlq $17, %r15
-; AVX512BW-NEXT:    orq %rax, %r15
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    kshiftrq $56, %k0, %k1
-; AVX512BW-NEXT:    shlq $18, %r12
-; AVX512BW-NEXT:    orq %r15, %r12
-; AVX512BW-NEXT:    movzbl %al, %r15d
-; AVX512BW-NEXT:    andl $1, %r15d
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $19, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r15, %r12
-; AVX512BW-NEXT:    shlq $20, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $21, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r15, %r12
-; AVX512BW-NEXT:    shlq $22, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    kshiftrq $63, %k0, %k1
-; AVX512BW-NEXT:    shlq $23, %r15
-; AVX512BW-NEXT:    orq %r12, %r15
-; AVX512BW-NEXT:    movzbl %al, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $24, %rax
-; AVX512BW-NEXT:    orq %r15, %rax
-; AVX512BW-NEXT:    movq %r12, %r15
-; AVX512BW-NEXT:    shlq $25, %r15
-; AVX512BW-NEXT:    orq %rax, %r15
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $26, %rax
-; AVX512BW-NEXT:    orq %r15, %rax
-; AVX512BW-NEXT:    movq %r12, %r15
-; AVX512BW-NEXT:    shlq $27, %r15
-; AVX512BW-NEXT:    orq %rax, %r15
-; AVX512BW-NEXT:    shlq $28, %r12
-; AVX512BW-NEXT:    orq %r15, %r12
-; AVX512BW-NEXT:    movzbl %bl, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %r15
-; AVX512BW-NEXT:    shlq $29, %r15
-; AVX512BW-NEXT:    orq %r12, %r15
-; AVX512BW-NEXT:    movq %rax, %r12
-; AVX512BW-NEXT:    shlq $30, %r12
-; AVX512BW-NEXT:    orq %r15, %r12
-; AVX512BW-NEXT:    kmovd %k1, %r15d
-; AVX512BW-NEXT:    kshiftrq $58, %k0, %k1
-; AVX512BW-NEXT:    shlq $31, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    andl $1, %ebx
-; AVX512BW-NEXT:    movq %rbx, %r13
-; AVX512BW-NEXT:    shlq $32, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrq $59, %k0, %k1
-; AVX512BW-NEXT:    shlq $33, %rbx
-; AVX512BW-NEXT:    orq %r13, %rbx
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $34, %rax
-; AVX512BW-NEXT:    orq %rbx, %rax
-; AVX512BW-NEXT:    movq %r12, %rbx
-; AVX512BW-NEXT:    shlq $35, %rbx
-; AVX512BW-NEXT:    orq %rax, %rbx
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $36, %rax
-; AVX512BW-NEXT:    orq %rbx, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $37, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    kmovd %k1, %ebx
-; AVX512BW-NEXT:    kshiftrq $60, %k0, %k1
-; AVX512BW-NEXT:    shlq $38, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    andl $1, %ebx
-; AVX512BW-NEXT:    movq %rbx, %rax
-; AVX512BW-NEXT:    shlq $39, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %rbx, %r12
-; AVX512BW-NEXT:    shlq $40, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %rbx, %rax
-; AVX512BW-NEXT:    shlq $41, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %rbx, %r13
-; AVX512BW-NEXT:    shlq $42, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrq $61, %k0, %k1
-; AVX512BW-NEXT:    shlq $43, %rbx
-; AVX512BW-NEXT:    orq %r13, %rbx
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $44, %rax
-; AVX512BW-NEXT:    orq %rbx, %rax
-; AVX512BW-NEXT:    movq %r12, %rbx
-; AVX512BW-NEXT:    shlq $45, %rbx
-; AVX512BW-NEXT:    orq %rax, %rbx
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $46, %rax
-; AVX512BW-NEXT:    orq %rbx, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $47, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    kmovd %k1, %ebx
-; AVX512BW-NEXT:    kshiftrq $62, %k0, %k1
-; AVX512BW-NEXT:    shlq $48, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    andl $1, %ebx
-; AVX512BW-NEXT:    movq %rbx, %rax
-; AVX512BW-NEXT:    shlq $49, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %rbx, %r12
-; AVX512BW-NEXT:    shlq $50, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %rbx, %rax
-; AVX512BW-NEXT:    shlq $51, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %rbx, %r12
-; AVX512BW-NEXT:    shlq $52, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    kshiftrq $25, %k0, %k1
-; AVX512BW-NEXT:    shlq $53, %rbx
-; AVX512BW-NEXT:    orq %r12, %rbx
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %r12
-; AVX512BW-NEXT:    shlq $54, %r12
-; AVX512BW-NEXT:    orq %rbx, %r12
-; AVX512BW-NEXT:    movq %rax, %rbx
-; AVX512BW-NEXT:    shlq $55, %rbx
-; AVX512BW-NEXT:    orq %r12, %rbx
-; AVX512BW-NEXT:    movq %rax, %r12
-; AVX512BW-NEXT:    shlq $56, %r12
-; AVX512BW-NEXT:    orq %rbx, %r12
-; AVX512BW-NEXT:    movq %rax, %rbx
-; AVX512BW-NEXT:    shlq $57, %rbx
-; AVX512BW-NEXT:    orq %r12, %rbx
-; AVX512BW-NEXT:    shlq $58, %rax
-; AVX512BW-NEXT:    orq %rbx, %rax
-; AVX512BW-NEXT:    movl %r15d, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rbx
-; AVX512BW-NEXT:    shlq $59, %rbx
-; AVX512BW-NEXT:    orq %rax, %rbx
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $60, %rax
-; AVX512BW-NEXT:    orq %rbx, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $61, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    kmovd %k1, %ebx
-; AVX512BW-NEXT:    kshiftrq $26, %k0, %k1
-; AVX512BW-NEXT:    shlq $62, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    kshiftrq $28, %k0, %k1
-; AVX512BW-NEXT:    movzbl %r15b, %r15d
-; AVX512BW-NEXT:    shlq $63, %r15
-; AVX512BW-NEXT:    orq %r12, %r15
-; AVX512BW-NEXT:    orq %r14, %r15
-; AVX512BW-NEXT:    movq %r15, 32(%rsi)
-; AVX512BW-NEXT:    movzbl %al, %r13d
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movzbl %bl, %r14d
-; AVX512BW-NEXT:    movl %r14d, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    leaq (%rax,%rax,2), %rax
-; AVX512BW-NEXT:    leaq (%rax,%r13,4), %rax
-; AVX512BW-NEXT:    leaq (%rax,%r13,8), %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $4, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r13, %r15
-; AVX512BW-NEXT:    shlq $5, %r15
-; AVX512BW-NEXT:    orq %r12, %r15
-; AVX512BW-NEXT:    shlq $6, %r13
-; AVX512BW-NEXT:    movzbl %bpl, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $7, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $8, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $9, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $10, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    kshiftrq $29, %k0, %k1
-; AVX512BW-NEXT:    shlq $11, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    movzbl %al, %r13d
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $12, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $13, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $14, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $15, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    kshiftrq $30, %k0, %k1
-; AVX512BW-NEXT:    shlq $16, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    movzbl %al, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $17, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $18, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $19, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $20, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    kshiftrq $31, %k0, %k1
-; AVX512BW-NEXT:    shlq $21, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    movzbl %al, %r13d
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $22, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $23, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $24, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $25, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    kshiftrq $32, %k0, %k1
-; AVX512BW-NEXT:    shlq $26, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    movzbl %al, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $27, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $28, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $29, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %rbp
-; AVX512BW-NEXT:    shlq $30, %rbp
-; AVX512BW-NEXT:    orq %rax, %rbp
-; AVX512BW-NEXT:    kmovd %k1, %r13d
-; AVX512BW-NEXT:    kshiftrq $33, %k0, %k1
-; AVX512BW-NEXT:    shlq $31, %r12
-; AVX512BW-NEXT:    orq %rbp, %r12
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $32, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $33, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $34, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %rbp
-; AVX512BW-NEXT:    shlq $35, %rbp
-; AVX512BW-NEXT:    orq %rax, %rbp
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrq $34, %k0, %k1
-; AVX512BW-NEXT:    shlq $36, %r13
-; AVX512BW-NEXT:    orq %rbp, %r13
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $37, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $38, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $39, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %rbp
-; AVX512BW-NEXT:    shlq $40, %rbp
-; AVX512BW-NEXT:    orq %rax, %rbp
-; AVX512BW-NEXT:    kmovd %k1, %r13d
-; AVX512BW-NEXT:    kshiftrq $35, %k0, %k1
-; AVX512BW-NEXT:    shlq $41, %r12
-; AVX512BW-NEXT:    orq %rbp, %r12
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $42, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $43, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $44, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %rbp
-; AVX512BW-NEXT:    shlq $45, %rbp
-; AVX512BW-NEXT:    orq %rax, %rbp
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrq $36, %k0, %k1
-; AVX512BW-NEXT:    shlq $46, %r13
-; AVX512BW-NEXT:    orq %rbp, %r13
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $47, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $48, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $49, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %rbp
-; AVX512BW-NEXT:    shlq $50, %rbp
-; AVX512BW-NEXT:    orq %rax, %rbp
-; AVX512BW-NEXT:    kmovd %k1, %r13d
-; AVX512BW-NEXT:    kshiftrq $37, %k0, %k1
-; AVX512BW-NEXT:    shlq $51, %r12
-; AVX512BW-NEXT:    orq %rbp, %r12
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $52, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $53, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $54, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %rbp
-; AVX512BW-NEXT:    shlq $55, %rbp
-; AVX512BW-NEXT:    orq %rax, %rbp
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrq $38, %k0, %k1
-; AVX512BW-NEXT:    shlq $56, %r13
-; AVX512BW-NEXT:    orq %rbp, %r13
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $57, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $58, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $59, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $60, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    kshiftrq $40, %k0, %k1
-; AVX512BW-NEXT:    shlq $61, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    movzbl %al, %r13d
-; AVX512BW-NEXT:    # kill: def $eax killed $eax def $rax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    shlq $62, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $63, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    kshiftrq $39, %k0, %k1
-; AVX512BW-NEXT:    orq %r15, %r12
-; AVX512BW-NEXT:    kmovd %k1, %ebp
-; AVX512BW-NEXT:    kshiftrq $41, %k0, %k1
-; AVX512BW-NEXT:    movq %r12, 16(%rsi)
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    leaq (%r13,%r13,2), %r15
-; AVX512BW-NEXT:    leaq (%r15,%r13,4), %r15
-; AVX512BW-NEXT:    movzbl %bpl, %r13d
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    leaq (%r15,%r13,8), %r15
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $4, %r12
-; AVX512BW-NEXT:    orq %r15, %r12
-; AVX512BW-NEXT:    movq %r13, %r15
-; AVX512BW-NEXT:    shlq $5, %r15
-; AVX512BW-NEXT:    orq %r12, %r15
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $6, %r12
-; AVX512BW-NEXT:    shlq $7, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    movzbl %al, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $8, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $9, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $10, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $11, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    kshiftrq $42, %k0, %k1
-; AVX512BW-NEXT:    shlq $12, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    movzbl %al, %r13d
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $13, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $14, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $15, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $16, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    kshiftrq $43, %k0, %k1
-; AVX512BW-NEXT:    shlq $17, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    movzbl %al, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $18, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $19, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $20, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $21, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    kshiftrq $44, %k0, %k1
-; AVX512BW-NEXT:    shlq $22, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    movzbl %al, %r13d
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $23, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $24, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $25, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $26, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    kshiftrq $45, %k0, %k1
-; AVX512BW-NEXT:    shlq $27, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    movzbl %al, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $28, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $29, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $30, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %rbp
-; AVX512BW-NEXT:    shlq $31, %rbp
-; AVX512BW-NEXT:    orq %rax, %rbp
-; AVX512BW-NEXT:    kmovd %k1, %r13d
-; AVX512BW-NEXT:    kshiftrq $46, %k0, %k1
-; AVX512BW-NEXT:    shlq $32, %r12
-; AVX512BW-NEXT:    orq %rbp, %r12
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $33, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $34, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $35, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %rbp
-; AVX512BW-NEXT:    shlq $36, %rbp
-; AVX512BW-NEXT:    orq %rax, %rbp
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrq $47, %k0, %k1
-; AVX512BW-NEXT:    shlq $37, %r13
-; AVX512BW-NEXT:    orq %rbp, %r13
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $38, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $39, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $40, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $41, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    kshiftrq $48, %k0, %k1
-; AVX512BW-NEXT:    shlq $42, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    movzbl %al, %r13d
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $43, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $44, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $45, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %rbp
-; AVX512BW-NEXT:    shlq $46, %rbp
-; AVX512BW-NEXT:    orq %rax, %rbp
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrq $49, %k0, %k1
-; AVX512BW-NEXT:    shlq $47, %r13
-; AVX512BW-NEXT:    orq %rbp, %r13
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $48, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $49, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $50, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %rbp
-; AVX512BW-NEXT:    shlq $51, %rbp
-; AVX512BW-NEXT:    orq %rax, %rbp
-; AVX512BW-NEXT:    kmovd %k1, %r13d
-; AVX512BW-NEXT:    kshiftrq $50, %k0, %k1
-; AVX512BW-NEXT:    shlq $52, %r12
-; AVX512BW-NEXT:    orq %rbp, %r12
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $53, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $54, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $55, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %rbp
-; AVX512BW-NEXT:    shlq $56, %rbp
-; AVX512BW-NEXT:    orq %rax, %rbp
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrq $13, %k0, %k1
-; AVX512BW-NEXT:    shlq $57, %r13
-; AVX512BW-NEXT:    orq %rbp, %r13
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $58, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $59, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $60, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $61, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    kmovd %k1, %ebp
-; AVX512BW-NEXT:    kshiftrq $20, %k0, %k1
-; AVX512BW-NEXT:    shlq $62, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    kmovd %k1, %r13d
-; AVX512BW-NEXT:    kshiftrq $21, %k0, %k1
-; AVX512BW-NEXT:    shlq $63, %r11
-; AVX512BW-NEXT:    orq %r12, %r11
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrq $22, %k0, %k1
-; AVX512BW-NEXT:    orq %r15, %r11
-; AVX512BW-NEXT:    kmovd %k1, %r15d
-; AVX512BW-NEXT:    kshiftrq $23, %k0, %k1
-; AVX512BW-NEXT:    movq %r11, 24(%rsi)
-; AVX512BW-NEXT:    movzbl %bpl, %r11d
-; AVX512BW-NEXT:    andl $1, %r11d
-; AVX512BW-NEXT:    andl $1, %r10d
-; AVX512BW-NEXT:    leaq (%r10,%r11,2), %rax
-; AVX512BW-NEXT:    leaq (%rax,%r11,4), %rax
-; AVX512BW-NEXT:    leaq (%rax,%r11,8), %rax
-; AVX512BW-NEXT:    movq %r11, %r10
-; AVX512BW-NEXT:    shlq $4, %r10
-; AVX512BW-NEXT:    orq %rax, %r10
-; AVX512BW-NEXT:    shlq $5, %r11
-; AVX512BW-NEXT:    orq %r10, %r11
-; AVX512BW-NEXT:    movzbl %r9b, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %r9
-; AVX512BW-NEXT:    shlq $6, %r9
-; AVX512BW-NEXT:    movq %rax, %r10
-; AVX512BW-NEXT:    shlq $7, %r10
-; AVX512BW-NEXT:    orq %r9, %r10
-; AVX512BW-NEXT:    movq %rax, %r9
-; AVX512BW-NEXT:    shlq $8, %r9
-; AVX512BW-NEXT:    orq %r10, %r9
-; AVX512BW-NEXT:    movq %rax, %r10
-; AVX512BW-NEXT:    shlq $9, %r10
-; AVX512BW-NEXT:    orq %r9, %r10
-; AVX512BW-NEXT:    kmovd %k1, %r9d
-; AVX512BW-NEXT:    kshiftrq $24, %k0, %k0
-; AVX512BW-NEXT:    shlq $10, %rax
-; AVX512BW-NEXT:    orq %r10, %rax
-; AVX512BW-NEXT:    movzbl %r8b, %r10d
-; AVX512BW-NEXT:    andl $1, %r10d
-; AVX512BW-NEXT:    movq %r10, %r8
-; AVX512BW-NEXT:    shlq $11, %r8
-; AVX512BW-NEXT:    orq %rax, %r8
-; AVX512BW-NEXT:    movq %r10, %rax
-; AVX512BW-NEXT:    shlq $12, %rax
-; AVX512BW-NEXT:    orq %r8, %rax
-; AVX512BW-NEXT:    movq %r10, %r8
-; AVX512BW-NEXT:    shlq $13, %r8
-; AVX512BW-NEXT:    orq %rax, %r8
-; AVX512BW-NEXT:    movq %r10, %rax
-; AVX512BW-NEXT:    shlq $14, %rax
-; AVX512BW-NEXT:    orq %r8, %rax
-; AVX512BW-NEXT:    kmovd %k0, %r8d
-; AVX512BW-NEXT:    shlq $15, %r10
-; AVX512BW-NEXT:    orq %rax, %r10
-; AVX512BW-NEXT:    movzbl %dil, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %rdi
-; AVX512BW-NEXT:    shlq $16, %rdi
-; AVX512BW-NEXT:    orq %r10, %rdi
-; AVX512BW-NEXT:    movq %rax, %r10
-; AVX512BW-NEXT:    shlq $17, %r10
-; AVX512BW-NEXT:    orq %rdi, %r10
-; AVX512BW-NEXT:    movq %rax, %rdi
-; AVX512BW-NEXT:    shlq $18, %rdi
-; AVX512BW-NEXT:    orq %r10, %rdi
-; AVX512BW-NEXT:    movq %rax, %r10
-; AVX512BW-NEXT:    shlq $19, %r10
-; AVX512BW-NEXT:    orq %rdi, %r10
-; AVX512BW-NEXT:    shlq $20, %rax
-; AVX512BW-NEXT:    orq %r10, %rax
-; AVX512BW-NEXT:    movzbl %dl, %edx
-; AVX512BW-NEXT:    andl $1, %edx
-; AVX512BW-NEXT:    movq %rdx, %rdi
-; AVX512BW-NEXT:    shlq $21, %rdi
-; AVX512BW-NEXT:    orq %rax, %rdi
-; AVX512BW-NEXT:    movq %rdx, %rax
-; AVX512BW-NEXT:    shlq $22, %rax
-; AVX512BW-NEXT:    orq %rdi, %rax
-; AVX512BW-NEXT:    movq %rdx, %rdi
-; AVX512BW-NEXT:    shlq $23, %rdi
-; AVX512BW-NEXT:    orq %rax, %rdi
-; AVX512BW-NEXT:    movq %rdx, %rax
-; AVX512BW-NEXT:    shlq $24, %rax
-; AVX512BW-NEXT:    orq %rdi, %rax
-; AVX512BW-NEXT:    shlq $25, %rdx
-; AVX512BW-NEXT:    orq %rax, %rdx
-; AVX512BW-NEXT:    movzbl %cl, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %rcx
-; AVX512BW-NEXT:    shlq $26, %rcx
-; AVX512BW-NEXT:    orq %rdx, %rcx
-; AVX512BW-NEXT:    movq %rax, %rdx
-; AVX512BW-NEXT:    shlq $27, %rdx
-; AVX512BW-NEXT:    orq %rcx, %rdx
-; AVX512BW-NEXT:    movq %rax, %rcx
-; AVX512BW-NEXT:    shlq $28, %rcx
-; AVX512BW-NEXT:    orq %rdx, %rcx
-; AVX512BW-NEXT:    movq %rax, %rdx
-; AVX512BW-NEXT:    shlq $29, %rdx
-; AVX512BW-NEXT:    orq %rcx, %rdx
-; AVX512BW-NEXT:    shlq $30, %rax
-; AVX512BW-NEXT:    orq %rdx, %rax
-; AVX512BW-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; AVX512BW-NEXT:    andl $1, %ecx
-; AVX512BW-NEXT:    movq %rcx, %rdx
-; AVX512BW-NEXT:    shlq $31, %rdx
-; AVX512BW-NEXT:    orq %rax, %rdx
-; AVX512BW-NEXT:    movq %rcx, %rax
-; AVX512BW-NEXT:    shlq $32, %rax
-; AVX512BW-NEXT:    orq %rdx, %rax
-; AVX512BW-NEXT:    movq %rcx, %rdx
-; AVX512BW-NEXT:    shlq $33, %rdx
-; AVX512BW-NEXT:    orq %rax, %rdx
-; AVX512BW-NEXT:    movq %rcx, %rax
-; AVX512BW-NEXT:    shlq $34, %rax
-; AVX512BW-NEXT:    orq %rdx, %rax
-; AVX512BW-NEXT:    shlq $35, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $36, %rax
-; AVX512BW-NEXT:    orq %rcx, %rax
-; AVX512BW-NEXT:    movq %r13, %rcx
-; AVX512BW-NEXT:    shlq $37, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $38, %rax
-; AVX512BW-NEXT:    orq %rcx, %rax
-; AVX512BW-NEXT:    movq %r13, %rcx
-; AVX512BW-NEXT:    shlq $39, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    shlq $40, %r13
-; AVX512BW-NEXT:    orq %rcx, %r13
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $41, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %rcx
-; AVX512BW-NEXT:    shlq $42, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $43, %rax
-; AVX512BW-NEXT:    orq %rcx, %rax
-; AVX512BW-NEXT:    movq %r12, %rcx
-; AVX512BW-NEXT:    shlq $44, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    shlq $45, %r12
-; AVX512BW-NEXT:    orq %rcx, %r12
-; AVX512BW-NEXT:    andl $1, %r15d
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $46, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r15, %rcx
-; AVX512BW-NEXT:    shlq $47, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $48, %rax
-; AVX512BW-NEXT:    orq %rcx, %rax
-; AVX512BW-NEXT:    movq %r15, %rcx
-; AVX512BW-NEXT:    shlq $49, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    shlq $50, %r15
-; AVX512BW-NEXT:    orq %rcx, %r15
-; AVX512BW-NEXT:    andl $1, %r9d
-; AVX512BW-NEXT:    movq %r9, %rax
-; AVX512BW-NEXT:    shlq $51, %rax
-; AVX512BW-NEXT:    orq %r15, %rax
-; AVX512BW-NEXT:    movq %r9, %rcx
-; AVX512BW-NEXT:    shlq $52, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    movq %r9, %rax
-; AVX512BW-NEXT:    shlq $53, %rax
-; AVX512BW-NEXT:    orq %rcx, %rax
-; AVX512BW-NEXT:    movq %r9, %rcx
-; AVX512BW-NEXT:    shlq $54, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    shlq $55, %r9
-; AVX512BW-NEXT:    orq %rcx, %r9
-; AVX512BW-NEXT:    andl $1, %r8d
-; AVX512BW-NEXT:    movq %r8, %rax
-; AVX512BW-NEXT:    shlq $56, %rax
-; AVX512BW-NEXT:    orq %r9, %rax
-; AVX512BW-NEXT:    movq %r8, %rcx
-; AVX512BW-NEXT:    shlq $57, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    movq %r8, %rax
-; AVX512BW-NEXT:    shlq $58, %rax
-; AVX512BW-NEXT:    orq %rcx, %rax
-; AVX512BW-NEXT:    movq %r8, %rcx
-; AVX512BW-NEXT:    shlq $59, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    shlq $60, %r8
-; AVX512BW-NEXT:    orq %rcx, %r8
-; AVX512BW-NEXT:    andl $1, %ebx
-; AVX512BW-NEXT:    movq %rbx, %rax
-; AVX512BW-NEXT:    shlq $61, %rax
-; AVX512BW-NEXT:    orq %r8, %rax
-; AVX512BW-NEXT:    shlq $62, %rbx
-; AVX512BW-NEXT:    orq %rax, %rbx
-; AVX512BW-NEXT:    shlq $63, %r14
-; AVX512BW-NEXT:    orq %rbx, %r14
-; AVX512BW-NEXT:    orq %r11, %r14
-; AVX512BW-NEXT:    movq %r14, 8(%rsi)
-; AVX512BW-NEXT:    popq %rbx
-; AVX512BW-NEXT:    popq %r12
-; AVX512BW-NEXT:    popq %r13
-; AVX512BW-NEXT:    popq %r14
-; AVX512BW-NEXT:    popq %r15
-; AVX512BW-NEXT:    popq %rbp
+; AVX512BW-NEXT:    kmovq (%rdi), %k5
+; AVX512BW-NEXT:    kshiftrq $1, %k5, %k0
+; AVX512BW-NEXT:    movw $-3, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k1
+; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k1, %k5, %k3
+; AVX512BW-NEXT:    kshiftlw $15, %k5, %k2
+; AVX512BW-NEXT:    kshiftrw $14, %k2, %k4
+; AVX512BW-NEXT:    korw %k4, %k3, %k3
+; AVX512BW-NEXT:    movw $-5, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k1
+; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k1, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $13, %k2, %k4
+; AVX512BW-NEXT:    korw %k4, %k3, %k3
+; AVX512BW-NEXT:    movw $-9, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k1
+; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k1, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $12, %k2, %k4
+; AVX512BW-NEXT:    korw %k4, %k3, %k3
+; AVX512BW-NEXT:    movw $-17, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k1
+; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k1, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $11, %k2, %k2
+; AVX512BW-NEXT:    korw %k2, %k3, %k2
+; AVX512BW-NEXT:    movw $-33, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k1
+; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k1, %k2, %k2
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $10, %k0, %k3
+; AVX512BW-NEXT:    korw %k3, %k2, %k2
+; AVX512BW-NEXT:    movw $-65, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k1
+; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k1, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $9, %k0, %k3
+; AVX512BW-NEXT:    korw %k3, %k2, %k2
+; AVX512BW-NEXT:    movw $-129, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k1
+; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k1, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $8, %k0, %k3
+; AVX512BW-NEXT:    korw %k3, %k2, %k2
+; AVX512BW-NEXT:    movw $-257, %ax # imm = 0xFEFF
+; AVX512BW-NEXT:    kmovd %eax, %k1
+; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k1, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $7, %k0, %k3
+; AVX512BW-NEXT:    korw %k3, %k2, %k2
+; AVX512BW-NEXT:    movw $-513, %ax # imm = 0xFDFF
+; AVX512BW-NEXT:    kmovd %eax, %k1
+; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k1, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $6, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k2, %k0
+; AVX512BW-NEXT:    movw $-1025, %ax # imm = 0xFBFF
+; AVX512BW-NEXT:    kmovd %eax, %k1
+; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k1, %k0, %k3
+; AVX512BW-NEXT:    kshiftrq $2, %k5, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k2
+; AVX512BW-NEXT:    kshiftrw $5, %k2, %k4
+; AVX512BW-NEXT:    korw %k4, %k3, %k3
+; AVX512BW-NEXT:    movw $-2049, %ax # imm = 0xF7FF
+; AVX512BW-NEXT:    kmovd %eax, %k1
+; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k1, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $4, %k2, %k4
+; AVX512BW-NEXT:    korw %k4, %k3, %k3
+; AVX512BW-NEXT:    movw $-4097, %ax # imm = 0xEFFF
+; AVX512BW-NEXT:    kmovd %eax, %k1
+; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k1, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $3, %k2, %k7
+; AVX512BW-NEXT:    korw %k7, %k3, %k7
+; AVX512BW-NEXT:    movw $-8193, %ax # imm = 0xDFFF
+; AVX512BW-NEXT:    kmovd %eax, %k1
+; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k1, %k7, %k7
+; AVX512BW-NEXT:    kshiftrw $2, %k2, %k2
+; AVX512BW-NEXT:    korw %k2, %k7, %k7
+; AVX512BW-NEXT:    movw $-16385, %ax # imm = 0xBFFF
+; AVX512BW-NEXT:    kmovd %eax, %k6
+; AVX512BW-NEXT:    kandw %k6, %k7, %k7
+; AVX512BW-NEXT:    kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kshiftlw $14, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k7, %k0
+; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $1, %k0, %k1
+; AVX512BW-NEXT:    kshiftrq $3, %k5, %k7
+; AVX512BW-NEXT:    kshiftlw $15, %k7, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k1
+; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k7, %k1
+; AVX512BW-NEXT:    kshiftrw $14, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $13, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $12, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $4, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $11, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $10, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $9, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $8, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $7, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $5, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $6, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $5, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $4, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $3, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $2, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $6, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $14, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k6
+; AVX512BW-NEXT:    vmovdqa32 64(%rsi), %zmm1 {%k6} {z}
+; AVX512BW-NEXT:    kandw %k2, %k1, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k7, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $13, %k7, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $7, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $12, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $8, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $8, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $4, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $3, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $9, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
+; AVX512BW-NEXT:    kshiftrw $2, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $14, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512BW-NEXT:    korw %k6, %k0, %k7
+; AVX512BW-NEXT:    vmovdqa32 128(%rsi), %zmm2 {%k7} {z}
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k1, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $10, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $13, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $12, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $9, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $11, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $4, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $12, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
+; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $2, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $14, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512BW-NEXT:    korw %k6, %k0, %k6
+; AVX512BW-NEXT:    vmovdqa32 192(%rsi), %zmm3 {%k6} {z}
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k1, %k0
+; AVX512BW-NEXT:    kshiftrq $13, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $14, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $13, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $12, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $10, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $14, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $5, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $15, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
+; AVX512BW-NEXT:    kshiftrw $4, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $2, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $14, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512BW-NEXT:    korw %k6, %k0, %k1
+; AVX512BW-NEXT:    vmovdqa32 256(%rsi), %zmm4 {%k1} {z}
+; AVX512BW-NEXT:    kshiftrq $16, %k5, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $13, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $12, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $11, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $17, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $6, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $18, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
+; AVX512BW-NEXT:    kshiftrw $5, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $4, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $2, %k6, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $14, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $19, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k7
+; AVX512BW-NEXT:    vmovdqa32 320(%rsi), %zmm5 {%k7} {z}
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k1, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $13, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $12, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $20, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $7, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $21, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $4, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $3, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $2, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $22, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $14, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k7
+; AVX512BW-NEXT:    vmovdqa32 384(%rsi), %zmm6 {%k7} {z}
+; AVX512BW-NEXT:    kandw %k4, %k1, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $13, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $23, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $12, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $8, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $24, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $4, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $3, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $25, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
+; AVX512BW-NEXT:    kshiftrw $2, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $14, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512BW-NEXT:    korw %k6, %k0, %k7
+; AVX512BW-NEXT:    vmovdqa32 448(%rsi), %zmm7 {%k7} {z}
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k1, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $26, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $13, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $12, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $9, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $27, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $4, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $28, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
+; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $2, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $14, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512BW-NEXT:    korw %k6, %k0, %k6
+; AVX512BW-NEXT:    vmovdqa32 512(%rsi), %zmm8 {%k6} {z}
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k1, %k0
+; AVX512BW-NEXT:    kshiftrq $29, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $14, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $13, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $12, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $10, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $30, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $5, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $31, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
+; AVX512BW-NEXT:    kshiftrw $4, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $2, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $14, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512BW-NEXT:    korw %k6, %k0, %k1
+; AVX512BW-NEXT:    vmovdqa32 576(%rsi), %zmm9 {%k1} {z}
+; AVX512BW-NEXT:    kshiftrq $32, %k5, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $13, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $12, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $11, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $33, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $6, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $34, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
+; AVX512BW-NEXT:    kshiftrw $5, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $4, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $2, %k6, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $14, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $35, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k7
+; AVX512BW-NEXT:    vmovdqa32 640(%rsi), %zmm10 {%k7} {z}
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k1, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $13, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $12, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $36, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $7, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $37, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $4, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $3, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $2, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $38, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $14, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k7
+; AVX512BW-NEXT:    vmovdqa32 704(%rsi), %zmm11 {%k7} {z}
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k1, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $13, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $39, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $12, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $8, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $40, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $4, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $3, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $41, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
+; AVX512BW-NEXT:    kshiftrw $2, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $14, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512BW-NEXT:    korw %k6, %k0, %k7
+; AVX512BW-NEXT:    vmovdqa32 768(%rsi), %zmm12 {%k7} {z}
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $42, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $13, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $12, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $9, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $43, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $4, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $44, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
+; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $2, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $14, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512BW-NEXT:    korw %k6, %k0, %k6
+; AVX512BW-NEXT:    vmovdqa32 832(%rsi), %zmm13 {%k6} {z}
+; AVX512BW-NEXT:    kandw %k2, %k1, %k0
+; AVX512BW-NEXT:    kshiftrq $45, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $14, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $13, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $12, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $10, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $46, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $5, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $47, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
+; AVX512BW-NEXT:    kshiftrw $4, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $2, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $14, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512BW-NEXT:    korw %k6, %k0, %k1
+; AVX512BW-NEXT:    vmovdqa32 896(%rsi), %zmm14 {%k1} {z}
+; AVX512BW-NEXT:    kshiftrq $48, %k5, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $13, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $12, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $11, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $49, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $6, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $50, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
+; AVX512BW-NEXT:    kshiftrw $5, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $4, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $2, %k6, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $14, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $51, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k7
+; AVX512BW-NEXT:    vmovdqa32 960(%rsi), %zmm15 {%k7} {z}
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k1, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $13, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $12, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $52, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $7, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $53, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $4, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $3, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $2, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $54, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $14, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k7
+; AVX512BW-NEXT:    vmovdqa32 1024(%rsi), %zmm16 {%k7} {z}
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k1, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $13, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $55, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $12, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $8, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $56, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $4, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $3, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $57, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
+; AVX512BW-NEXT:    kshiftrw $2, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $14, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512BW-NEXT:    korw %k6, %k0, %k7
+; AVX512BW-NEXT:    vmovdqa32 1088(%rsi), %zmm17 {%k7} {z}
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k1, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $58, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $13, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $12, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $9, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $59, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $4, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $60, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
+; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $2, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $14, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512BW-NEXT:    korw %k6, %k0, %k6
+; AVX512BW-NEXT:    vmovdqa32 1152(%rsi), %zmm18 {%k6} {z}
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k1, %k0
+; AVX512BW-NEXT:    kshiftrq $61, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $14, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $13, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $12, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $10, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $62, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $63, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $5, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k5, %k1
+; AVX512BW-NEXT:    kshiftrw $4, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $3, %k1, %k4
+; AVX512BW-NEXT:    korw %k4, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $2, %k1, %k3
+; AVX512BW-NEXT:    korw %k3, %k0, %k0
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $14, %k5, %k2
+; AVX512BW-NEXT:    korw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512BW-NEXT:    korw %k1, %k0, %k1
+; AVX512BW-NEXT:    vmovdqa32 1216(%rsi), %zmm19 {%k1} {z}
+; AVX512BW-NEXT:    vmovdqa64 %zmm19, 1216(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm18, 1152(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, 1088(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, 1024(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, 960(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, 896(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, 832(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm12, 768(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, 704(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm10, 640(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, 576(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, 512(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, 448(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, 384(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, 320(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, 256(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, 192(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, 128(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, 64(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdx)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
-  %src.vec = load <64 x i1>, ptr %in.vec, align 64
-  %tgt.mask = shufflevector <64 x i1> %src.vec, <64 x i1> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, 
i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
-  store <320 x i1> %tgt.mask, ptr %out.vec, align 64
+  %src.mask = load <64 x i1>, ptr %in.maskvec, align 64
+  %tgt.mask = shufflevector <64 x i1> %src.mask, <64 x i1> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, 
i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
+  %data = call <320 x i32> @llvm.masked.load.v320i32.p0(ptr %in.vec, i32 64, <320 x i1> %tgt.mask, <320 x i32> poison)
+  store <320 x i32> %data, ptr %out.vec, align 64
   ret void
 }
 
-define void @mask_replication_factor6_vf2(ptr %in.vec, ptr %out.vec) nounwind {
+define void @mask_replication_factor6_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512F-ONLY-LABEL: mask_replication_factor6_vf2:
 ; AVX512F-ONLY:       # %bb.0:
 ; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
-; AVX512F-ONLY-NEXT:    kshiftrw $1, %k1, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k1, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    leal (%rax,%rax,2), %ecx
-; AVX512F-ONLY-NEXT:    leal (%rcx,%rax,4), %ecx
-; AVX512F-ONLY-NEXT:    leal (%rcx,%rax,8), %ecx
-; AVX512F-ONLY-NEXT:    movl %eax, %edx
-; AVX512F-ONLY-NEXT:    shll $4, %edx
-; AVX512F-ONLY-NEXT:    orl %ecx, %edx
-; AVX512F-ONLY-NEXT:    shll $5, %eax
-; AVX512F-ONLY-NEXT:    orl %edx, %eax
-; AVX512F-ONLY-NEXT:    kmovw %k0, %ecx
-; AVX512F-ONLY-NEXT:    movl %ecx, %edx
-; AVX512F-ONLY-NEXT:    andl $1, %edx
-; AVX512F-ONLY-NEXT:    movl %edx, %edi
-; AVX512F-ONLY-NEXT:    shll $6, %edi
-; AVX512F-ONLY-NEXT:    movl %edx, %r8d
-; AVX512F-ONLY-NEXT:    shll $7, %r8d
-; AVX512F-ONLY-NEXT:    orl %edi, %r8d
-; AVX512F-ONLY-NEXT:    movl %edx, %edi
-; AVX512F-ONLY-NEXT:    shll $8, %edi
-; AVX512F-ONLY-NEXT:    orl %r8d, %edi
-; AVX512F-ONLY-NEXT:    movl %edx, %r8d
-; AVX512F-ONLY-NEXT:    shll $9, %r8d
-; AVX512F-ONLY-NEXT:    orl %edi, %r8d
-; AVX512F-ONLY-NEXT:    shll $10, %edx
-; AVX512F-ONLY-NEXT:    orl %r8d, %edx
-; AVX512F-ONLY-NEXT:    shll $11, %ecx
-; AVX512F-ONLY-NEXT:    orl %edx, %ecx
-; AVX512F-ONLY-NEXT:    orl %eax, %ecx
-; AVX512F-ONLY-NEXT:    andl $4095, %ecx # imm = 0xFFF
-; AVX512F-ONLY-NEXT:    movw %cx, (%rsi)
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,0,0,1,1,1,1,1,1,u,u,u,u>
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512F-ONLY-NEXT:    vpslld $31, %zmm0, %zmm0
+; AVX512F-ONLY-NEXT:    movw $4095, %ax # imm = 0xFFF
+; AVX512F-ONLY-NEXT:    kmovw %eax, %k1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
+; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vextracti32x4 $2, %zmm0, 32(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa %ymm0, (%rdx)
+; AVX512F-ONLY-NEXT:    vzeroupper
 ; AVX512F-ONLY-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: mask_replication_factor6_vf2:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    kmovb (%rdi), %k1
-; AVX512DQ-NEXT:    kshiftrb $1, %k1, %k0
-; AVX512DQ-NEXT:    kmovw %k1, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    leal (%rax,%rax,2), %ecx
-; AVX512DQ-NEXT:    leal (%rcx,%rax,4), %ecx
-; AVX512DQ-NEXT:    leal (%rcx,%rax,8), %ecx
-; AVX512DQ-NEXT:    movl %eax, %edx
-; AVX512DQ-NEXT:    shll $4, %edx
-; AVX512DQ-NEXT:    orl %ecx, %edx
-; AVX512DQ-NEXT:    shll $5, %eax
-; AVX512DQ-NEXT:    orl %edx, %eax
-; AVX512DQ-NEXT:    kmovw %k0, %ecx
-; AVX512DQ-NEXT:    movl %ecx, %edx
-; AVX512DQ-NEXT:    andl $1, %edx
-; AVX512DQ-NEXT:    movl %edx, %edi
-; AVX512DQ-NEXT:    shll $6, %edi
-; AVX512DQ-NEXT:    movl %edx, %r8d
-; AVX512DQ-NEXT:    shll $7, %r8d
-; AVX512DQ-NEXT:    orl %edi, %r8d
-; AVX512DQ-NEXT:    movl %edx, %edi
-; AVX512DQ-NEXT:    shll $8, %edi
-; AVX512DQ-NEXT:    orl %r8d, %edi
-; AVX512DQ-NEXT:    movl %edx, %r8d
-; AVX512DQ-NEXT:    shll $9, %r8d
-; AVX512DQ-NEXT:    orl %edi, %r8d
-; AVX512DQ-NEXT:    shll $10, %edx
-; AVX512DQ-NEXT:    orl %r8d, %edx
-; AVX512DQ-NEXT:    shll $11, %ecx
-; AVX512DQ-NEXT:    orl %edx, %ecx
-; AVX512DQ-NEXT:    orl %eax, %ecx
-; AVX512DQ-NEXT:    andl $4095, %ecx # imm = 0xFFF
-; AVX512DQ-NEXT:    movw %cx, (%rsi)
+; AVX512DQ-NEXT:    kmovw (%rdi), %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,0,0,1,1,1,1,1,1,u,u,u,u>
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512DQ-NEXT:    movw $4095, %ax # imm = 0xFFF
+; AVX512DQ-NEXT:    kmovw %eax, %k1
+; AVX512DQ-NEXT:    vpcmpgtd %zmm0, %zmm1, %k1 {%k1}
+; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
+; AVX512DQ-NEXT:    vextracti32x4 $2, %zmm0, 32(%rdx)
+; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rdx)
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512BW-LABEL: mask_replication_factor6_vf2:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    kmovw (%rdi), %k1
-; AVX512BW-NEXT:    kshiftrw $1, %k1, %k0
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    leal (%rax,%rax,2), %ecx
-; AVX512BW-NEXT:    leal (%rcx,%rax,4), %ecx
-; AVX512BW-NEXT:    leal (%rcx,%rax,8), %ecx
-; AVX512BW-NEXT:    movl %eax, %edx
-; AVX512BW-NEXT:    shll $4, %edx
-; AVX512BW-NEXT:    orl %ecx, %edx
-; AVX512BW-NEXT:    shll $5, %eax
-; AVX512BW-NEXT:    orl %edx, %eax
-; AVX512BW-NEXT:    kmovd %k0, %ecx
-; AVX512BW-NEXT:    movl %ecx, %edx
-; AVX512BW-NEXT:    andl $1, %edx
-; AVX512BW-NEXT:    movl %edx, %edi
-; AVX512BW-NEXT:    shll $6, %edi
-; AVX512BW-NEXT:    movl %edx, %r8d
-; AVX512BW-NEXT:    shll $7, %r8d
-; AVX512BW-NEXT:    orl %edi, %r8d
-; AVX512BW-NEXT:    movl %edx, %edi
-; AVX512BW-NEXT:    shll $8, %edi
-; AVX512BW-NEXT:    orl %r8d, %edi
-; AVX512BW-NEXT:    movl %edx, %r8d
-; AVX512BW-NEXT:    shll $9, %r8d
-; AVX512BW-NEXT:    orl %edi, %r8d
-; AVX512BW-NEXT:    shll $10, %edx
-; AVX512BW-NEXT:    orl %r8d, %edx
-; AVX512BW-NEXT:    shll $11, %ecx
-; AVX512BW-NEXT:    orl %edx, %ecx
-; AVX512BW-NEXT:    orl %eax, %ecx
-; AVX512BW-NEXT:    andl $4095, %ecx # imm = 0xFFF
-; AVX512BW-NEXT:    movw %cx, (%rsi)
+; AVX512BW-NEXT:    kmovq (%rdi), %k1
+; AVX512BW-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,0,0,1,1,1,1,1,1,u,u,u,u>
+; AVX512BW-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vpslld $31, %zmm0, %zmm0
+; AVX512BW-NEXT:    movw $4095, %ax # imm = 0xFFF
+; AVX512BW-NEXT:    kmovd %eax, %k1
+; AVX512BW-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
+; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    vextracti32x4 $2, %zmm0, 32(%rdx)
+; AVX512BW-NEXT:    vmovdqa %ymm0, (%rdx)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
-  %src.vec = load <2 x i1>, ptr %in.vec, align 64
-  %tgt.mask = shufflevector <2 x i1> %src.vec, <2 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-  store <12 x i1> %tgt.mask, ptr %out.vec, align 64
+  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
+  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <2 x i32> <i32 0, i32 1>
+  %tgt.mask = shufflevector <2 x i1> %src.mask, <2 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %data = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr %in.vec, i32 64, <12 x i1> %tgt.mask, <12 x i32> poison)
+  %data.padded = shufflevector <12 x i32> %data, <12 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef, i32 undef, i32 undef>
+  store <12 x i32> %data, ptr %out.vec, align 64
   ret void
 }
 
-define void @mask_replication_factor6_vf4(ptr %in.vec, ptr %out.vec) nounwind {
-; AVX512F-ONLY-LABEL: mask_replication_factor6_vf4:
-; AVX512F-ONLY:       # %bb.0:
-; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k3
-; AVX512F-ONLY-NEXT:    kshiftrw $3, %k3, %k0
-; AVX512F-ONLY-NEXT:    kshiftrw $2, %k3, %k1
-; AVX512F-ONLY-NEXT:    kshiftrw $1, %k3, %k2
-; AVX512F-ONLY-NEXT:    kmovw %k3, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    leal (%rax,%rax,2), %ecx
-; AVX512F-ONLY-NEXT:    leal (%rcx,%rax,4), %ecx
-; AVX512F-ONLY-NEXT:    leal (%rcx,%rax,8), %ecx
-; AVX512F-ONLY-NEXT:    movl %eax, %edx
-; AVX512F-ONLY-NEXT:    shll $4, %edx
-; AVX512F-ONLY-NEXT:    orl %ecx, %edx
-; AVX512F-ONLY-NEXT:    shll $5, %eax
-; AVX512F-ONLY-NEXT:    orl %edx, %eax
-; AVX512F-ONLY-NEXT:    kmovw %k2, %ecx
-; AVX512F-ONLY-NEXT:    andl $1, %ecx
-; AVX512F-ONLY-NEXT:    movl %ecx, %edx
-; AVX512F-ONLY-NEXT:    shll $6, %edx
-; AVX512F-ONLY-NEXT:    movl %ecx, %edi
-; AVX512F-ONLY-NEXT:    shll $7, %edi
-; AVX512F-ONLY-NEXT:    orl %edx, %edi
-; AVX512F-ONLY-NEXT:    movl %ecx, %edx
-; AVX512F-ONLY-NEXT:    shll $8, %edx
-; AVX512F-ONLY-NEXT:    orl %edi, %edx
-; AVX512F-ONLY-NEXT:    movl %ecx, %edi
-; AVX512F-ONLY-NEXT:    shll $9, %edi
-; AVX512F-ONLY-NEXT:    orl %edx, %edi
-; AVX512F-ONLY-NEXT:    movl %ecx, %edx
-; AVX512F-ONLY-NEXT:    shll $10, %edx
-; AVX512F-ONLY-NEXT:    orl %edi, %edx
-; AVX512F-ONLY-NEXT:    shll $11, %ecx
-; AVX512F-ONLY-NEXT:    orl %edx, %ecx
-; AVX512F-ONLY-NEXT:    kmovw %k1, %edx
-; AVX512F-ONLY-NEXT:    andl $1, %edx
-; AVX512F-ONLY-NEXT:    movl %edx, %edi
-; AVX512F-ONLY-NEXT:    shll $12, %edi
-; AVX512F-ONLY-NEXT:    orl %ecx, %edi
-; AVX512F-ONLY-NEXT:    movl %edx, %ecx
-; AVX512F-ONLY-NEXT:    shll $13, %ecx
-; AVX512F-ONLY-NEXT:    orl %edi, %ecx
-; AVX512F-ONLY-NEXT:    movl %edx, %edi
-; AVX512F-ONLY-NEXT:    shll $14, %edi
-; AVX512F-ONLY-NEXT:    orl %ecx, %edi
-; AVX512F-ONLY-NEXT:    movl %edx, %ecx
-; AVX512F-ONLY-NEXT:    shll $15, %ecx
-; AVX512F-ONLY-NEXT:    orl %edi, %ecx
-; AVX512F-ONLY-NEXT:    orl %eax, %ecx
-; AVX512F-ONLY-NEXT:    movw %cx, (%rsi)
-; AVX512F-ONLY-NEXT:    movl %edx, %eax
-; AVX512F-ONLY-NEXT:    shll $16, %eax
-; AVX512F-ONLY-NEXT:    shll $17, %edx
-; AVX512F-ONLY-NEXT:    orl %eax, %edx
-; AVX512F-ONLY-NEXT:    kmovw %k0, %eax
-; AVX512F-ONLY-NEXT:    movl %eax, %edi
-; AVX512F-ONLY-NEXT:    andl $1, %edi
-; AVX512F-ONLY-NEXT:    movl %edi, %r8d
-; AVX512F-ONLY-NEXT:    shll $18, %r8d
-; AVX512F-ONLY-NEXT:    orl %edx, %r8d
-; AVX512F-ONLY-NEXT:    movl %edi, %edx
-; AVX512F-ONLY-NEXT:    shll $19, %edx
-; AVX512F-ONLY-NEXT:    orl %r8d, %edx
-; AVX512F-ONLY-NEXT:    movl %edi, %r8d
-; AVX512F-ONLY-NEXT:    shll $20, %r8d
-; AVX512F-ONLY-NEXT:    orl %edx, %r8d
-; AVX512F-ONLY-NEXT:    movl %edi, %edx
-; AVX512F-ONLY-NEXT:    shll $21, %edx
-; AVX512F-ONLY-NEXT:    orl %r8d, %edx
-; AVX512F-ONLY-NEXT:    shll $22, %edi
-; AVX512F-ONLY-NEXT:    orl %edx, %edi
-; AVX512F-ONLY-NEXT:    shll $23, %eax
-; AVX512F-ONLY-NEXT:    orl %edi, %eax
-; AVX512F-ONLY-NEXT:    orl %ecx, %eax
-; AVX512F-ONLY-NEXT:    shrl $16, %eax
-; AVX512F-ONLY-NEXT:    movb %al, 2(%rsi)
-; AVX512F-ONLY-NEXT:    retq
+define void @mask_replication_factor6_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
+; AVX512F-SLOW-LABEL: mask_replication_factor6_vf4:
+; AVX512F-SLOW:       # %bb.0:
+; AVX512F-SLOW-NEXT:    kmovw (%rdi), %k1
+; AVX512F-SLOW-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
+; AVX512F-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,1]
+; AVX512F-SLOW-NEXT:    vpslld $31, %zmm1, %zmm1
+; AVX512F-SLOW-NEXT:    movw $255, %ax
+; AVX512F-SLOW-NEXT:    kmovw %eax, %k1
+; AVX512F-SLOW-NEXT:    vptestmd %zmm1, %zmm1, %k1 {%k1}
+; AVX512F-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
+; AVX512F-SLOW-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512F-SLOW-NEXT:    vptestmd %zmm0, %zmm0, %k2
+; AVX512F-SLOW-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
+; AVX512F-SLOW-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k2} {z}
+; AVX512F-SLOW-NEXT:    vmovdqa64 %zmm1, (%rdx)
+; AVX512F-SLOW-NEXT:    vmovdqa %ymm0, 64(%rdx)
+; AVX512F-SLOW-NEXT:    vzeroupper
+; AVX512F-SLOW-NEXT:    retq
 ;
-; AVX512DQ-LABEL: mask_replication_factor6_vf4:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    kmovb (%rdi), %k3
-; AVX512DQ-NEXT:    kshiftrb $3, %k3, %k0
-; AVX512DQ-NEXT:    kshiftrb $2, %k3, %k1
-; AVX512DQ-NEXT:    kshiftrb $1, %k3, %k2
-; AVX512DQ-NEXT:    kmovw %k3, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    leal (%rax,%rax,2), %ecx
-; AVX512DQ-NEXT:    leal (%rcx,%rax,4), %ecx
-; AVX512DQ-NEXT:    leal (%rcx,%rax,8), %ecx
-; AVX512DQ-NEXT:    movl %eax, %edx
-; AVX512DQ-NEXT:    shll $4, %edx
-; AVX512DQ-NEXT:    orl %ecx, %edx
-; AVX512DQ-NEXT:    shll $5, %eax
-; AVX512DQ-NEXT:    orl %edx, %eax
-; AVX512DQ-NEXT:    kmovw %k2, %ecx
-; AVX512DQ-NEXT:    andl $1, %ecx
-; AVX512DQ-NEXT:    movl %ecx, %edx
-; AVX512DQ-NEXT:    shll $6, %edx
-; AVX512DQ-NEXT:    movl %ecx, %edi
-; AVX512DQ-NEXT:    shll $7, %edi
-; AVX512DQ-NEXT:    orl %edx, %edi
-; AVX512DQ-NEXT:    movl %ecx, %edx
-; AVX512DQ-NEXT:    shll $8, %edx
-; AVX512DQ-NEXT:    orl %edi, %edx
-; AVX512DQ-NEXT:    movl %ecx, %edi
-; AVX512DQ-NEXT:    shll $9, %edi
-; AVX512DQ-NEXT:    orl %edx, %edi
-; AVX512DQ-NEXT:    movl %ecx, %edx
-; AVX512DQ-NEXT:    shll $10, %edx
-; AVX512DQ-NEXT:    orl %edi, %edx
-; AVX512DQ-NEXT:    shll $11, %ecx
-; AVX512DQ-NEXT:    orl %edx, %ecx
-; AVX512DQ-NEXT:    kmovw %k1, %edx
-; AVX512DQ-NEXT:    andl $1, %edx
-; AVX512DQ-NEXT:    movl %edx, %edi
-; AVX512DQ-NEXT:    shll $12, %edi
-; AVX512DQ-NEXT:    orl %ecx, %edi
-; AVX512DQ-NEXT:    movl %edx, %ecx
-; AVX512DQ-NEXT:    shll $13, %ecx
-; AVX512DQ-NEXT:    orl %edi, %ecx
-; AVX512DQ-NEXT:    movl %edx, %edi
-; AVX512DQ-NEXT:    shll $14, %edi
-; AVX512DQ-NEXT:    orl %ecx, %edi
-; AVX512DQ-NEXT:    movl %edx, %ecx
-; AVX512DQ-NEXT:    shll $15, %ecx
-; AVX512DQ-NEXT:    orl %edi, %ecx
-; AVX512DQ-NEXT:    orl %eax, %ecx
-; AVX512DQ-NEXT:    movw %cx, (%rsi)
-; AVX512DQ-NEXT:    movl %edx, %eax
-; AVX512DQ-NEXT:    shll $16, %eax
-; AVX512DQ-NEXT:    shll $17, %edx
-; AVX512DQ-NEXT:    orl %eax, %edx
-; AVX512DQ-NEXT:    kmovw %k0, %eax
-; AVX512DQ-NEXT:    movl %eax, %edi
-; AVX512DQ-NEXT:    andl $1, %edi
-; AVX512DQ-NEXT:    movl %edi, %r8d
-; AVX512DQ-NEXT:    shll $18, %r8d
-; AVX512DQ-NEXT:    orl %edx, %r8d
-; AVX512DQ-NEXT:    movl %edi, %edx
-; AVX512DQ-NEXT:    shll $19, %edx
-; AVX512DQ-NEXT:    orl %r8d, %edx
-; AVX512DQ-NEXT:    movl %edi, %r8d
-; AVX512DQ-NEXT:    shll $20, %r8d
-; AVX512DQ-NEXT:    orl %edx, %r8d
-; AVX512DQ-NEXT:    movl %edi, %edx
-; AVX512DQ-NEXT:    shll $21, %edx
-; AVX512DQ-NEXT:    orl %r8d, %edx
-; AVX512DQ-NEXT:    shll $22, %edi
-; AVX512DQ-NEXT:    orl %edx, %edi
-; AVX512DQ-NEXT:    shll $23, %eax
-; AVX512DQ-NEXT:    orl %edi, %eax
-; AVX512DQ-NEXT:    orl %ecx, %eax
-; AVX512DQ-NEXT:    shrl $16, %eax
-; AVX512DQ-NEXT:    movb %al, 2(%rsi)
-; AVX512DQ-NEXT:    retq
+; AVX512F-FAST-LABEL: mask_replication_factor6_vf4:
+; AVX512F-FAST:       # %bb.0:
+; AVX512F-FAST-NEXT:    kmovw (%rdi), %k1
+; AVX512F-FAST-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [2,2,3,3,3,3,3,3]
+; AVX512F-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm1
+; AVX512F-FAST-NEXT:    vpslld $31, %zmm1, %zmm1
+; AVX512F-FAST-NEXT:    movw $255, %ax
+; AVX512F-FAST-NEXT:    kmovw %eax, %k1
+; AVX512F-FAST-NEXT:    vptestmd %zmm1, %zmm1, %k1 {%k1}
+; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
+; AVX512F-FAST-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512F-FAST-NEXT:    vptestmd %zmm0, %zmm0, %k2
+; AVX512F-FAST-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
+; AVX512F-FAST-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k2} {z}
+; AVX512F-FAST-NEXT:    vmovdqa64 %zmm1, (%rdx)
+; AVX512F-FAST-NEXT:    vmovdqa %ymm0, 64(%rdx)
+; AVX512F-FAST-NEXT:    vzeroupper
+; AVX512F-FAST-NEXT:    retq
+;
+; AVX512DQ-SLOW-LABEL: mask_replication_factor6_vf4:
+; AVX512DQ-SLOW:       # %bb.0:
+; AVX512DQ-SLOW-NEXT:    kmovw (%rdi), %k0
+; AVX512DQ-SLOW-NEXT:    vpmovm2d %k0, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
+; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,1]
+; AVX512DQ-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512DQ-SLOW-NEXT:    movw $255, %ax
+; AVX512DQ-SLOW-NEXT:    kmovw %eax, %k1
+; AVX512DQ-SLOW-NEXT:    vpcmpgtd %zmm1, %zmm2, %k1 {%k1}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
+; AVX512DQ-SLOW-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpmovd2m %zmm0, %k2
+; AVX512DQ-SLOW-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
+; AVX512DQ-SLOW-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k2} {z}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, (%rdx)
+; AVX512DQ-SLOW-NEXT:    vmovdqa %ymm0, 64(%rdx)
+; AVX512DQ-SLOW-NEXT:    vzeroupper
+; AVX512DQ-SLOW-NEXT:    retq
+;
+; AVX512DQ-FAST-LABEL: mask_replication_factor6_vf4:
+; AVX512DQ-FAST:       # %bb.0:
+; AVX512DQ-FAST-NEXT:    kmovw (%rdi), %k0
+; AVX512DQ-FAST-NEXT:    vpmovm2d %k0, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [2,2,3,3,3,3,3,3]
+; AVX512DQ-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm1
+; AVX512DQ-FAST-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512DQ-FAST-NEXT:    movw $255, %ax
+; AVX512DQ-FAST-NEXT:    kmovw %eax, %k1
+; AVX512DQ-FAST-NEXT:    vpcmpgtd %zmm1, %zmm2, %k1 {%k1}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
+; AVX512DQ-FAST-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT:    vpmovd2m %zmm0, %k2
+; AVX512DQ-FAST-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
+; AVX512DQ-FAST-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k2} {z}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm1, (%rdx)
+; AVX512DQ-FAST-NEXT:    vmovdqa %ymm0, 64(%rdx)
+; AVX512DQ-FAST-NEXT:    vzeroupper
+; AVX512DQ-FAST-NEXT:    retq
 ;
 ; AVX512BW-LABEL: mask_replication_factor6_vf4:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    kmovw (%rdi), %k3
-; AVX512BW-NEXT:    kshiftrw $3, %k3, %k0
-; AVX512BW-NEXT:    kshiftrw $2, %k3, %k1
-; AVX512BW-NEXT:    kshiftrw $1, %k3, %k2
-; AVX512BW-NEXT:    kmovd %k3, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    leal (%rax,%rax,2), %ecx
-; AVX512BW-NEXT:    leal (%rcx,%rax,4), %ecx
-; AVX512BW-NEXT:    leal (%rcx,%rax,8), %ecx
-; AVX512BW-NEXT:    movl %eax, %edx
-; AVX512BW-NEXT:    shll $4, %edx
-; AVX512BW-NEXT:    orl %ecx, %edx
-; AVX512BW-NEXT:    shll $5, %eax
-; AVX512BW-NEXT:    orl %edx, %eax
-; AVX512BW-NEXT:    kmovd %k2, %ecx
-; AVX512BW-NEXT:    andl $1, %ecx
-; AVX512BW-NEXT:    movl %ecx, %edx
-; AVX512BW-NEXT:    shll $6, %edx
-; AVX512BW-NEXT:    movl %ecx, %edi
-; AVX512BW-NEXT:    shll $7, %edi
-; AVX512BW-NEXT:    orl %edx, %edi
-; AVX512BW-NEXT:    movl %ecx, %edx
-; AVX512BW-NEXT:    shll $8, %edx
-; AVX512BW-NEXT:    orl %edi, %edx
-; AVX512BW-NEXT:    movl %ecx, %edi
-; AVX512BW-NEXT:    shll $9, %edi
-; AVX512BW-NEXT:    orl %edx, %edi
-; AVX512BW-NEXT:    movl %ecx, %edx
-; AVX512BW-NEXT:    shll $10, %edx
-; AVX512BW-NEXT:    orl %edi, %edx
-; AVX512BW-NEXT:    shll $11, %ecx
-; AVX512BW-NEXT:    orl %edx, %ecx
-; AVX512BW-NEXT:    kmovd %k1, %edx
-; AVX512BW-NEXT:    andl $1, %edx
-; AVX512BW-NEXT:    movl %edx, %edi
-; AVX512BW-NEXT:    shll $12, %edi
-; AVX512BW-NEXT:    orl %ecx, %edi
-; AVX512BW-NEXT:    movl %edx, %ecx
-; AVX512BW-NEXT:    shll $13, %ecx
-; AVX512BW-NEXT:    orl %edi, %ecx
-; AVX512BW-NEXT:    movl %edx, %edi
-; AVX512BW-NEXT:    shll $14, %edi
-; AVX512BW-NEXT:    orl %ecx, %edi
-; AVX512BW-NEXT:    movl %edx, %ecx
-; AVX512BW-NEXT:    shll $15, %ecx
-; AVX512BW-NEXT:    orl %edi, %ecx
-; AVX512BW-NEXT:    orl %eax, %ecx
-; AVX512BW-NEXT:    movw %cx, (%rsi)
-; AVX512BW-NEXT:    movl %edx, %eax
-; AVX512BW-NEXT:    shll $16, %eax
-; AVX512BW-NEXT:    shll $17, %edx
-; AVX512BW-NEXT:    orl %eax, %edx
-; AVX512BW-NEXT:    kmovd %k0, %eax
-; AVX512BW-NEXT:    movl %eax, %edi
-; AVX512BW-NEXT:    andl $1, %edi
-; AVX512BW-NEXT:    movl %edi, %r8d
-; AVX512BW-NEXT:    shll $18, %r8d
-; AVX512BW-NEXT:    orl %edx, %r8d
-; AVX512BW-NEXT:    movl %edi, %edx
-; AVX512BW-NEXT:    shll $19, %edx
-; AVX512BW-NEXT:    orl %r8d, %edx
-; AVX512BW-NEXT:    movl %edi, %r8d
-; AVX512BW-NEXT:    shll $20, %r8d
-; AVX512BW-NEXT:    orl %edx, %r8d
-; AVX512BW-NEXT:    movl %edi, %edx
-; AVX512BW-NEXT:    shll $21, %edx
-; AVX512BW-NEXT:    orl %r8d, %edx
-; AVX512BW-NEXT:    shll $22, %edi
-; AVX512BW-NEXT:    orl %edx, %edi
-; AVX512BW-NEXT:    shll $23, %eax
-; AVX512BW-NEXT:    orl %edi, %eax
-; AVX512BW-NEXT:    orl %ecx, %eax
-; AVX512BW-NEXT:    shrl $16, %eax
-; AVX512BW-NEXT:    movb %al, 2(%rsi)
+; AVX512BW-NEXT:    kmovq (%rdi), %k0
+; AVX512BW-NEXT:    vpmovm2w %k0, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2,2,2,3,3,3,3,3,3,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT:    vpermw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT:    movl $16777215, %eax # imm = 0xFFFFFF
+; AVX512BW-NEXT:    kmovd %eax, %k1
+; AVX512BW-NEXT:    vpcmpgtw %zmm0, %zmm1, %k1 {%k1}
+; AVX512BW-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
+; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k1} {z}
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, (%rdx)
+; AVX512BW-NEXT:    vmovdqa %ymm0, 64(%rdx)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
-  %src.vec = load <4 x i1>, ptr %in.vec, align 64
-  %tgt.mask = shufflevector <4 x i1> %src.vec, <4 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-  store <24 x i1> %tgt.mask, ptr %out.vec, align 64
+  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
+  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %tgt.mask = shufflevector <4 x i1> %src.mask, <4 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  %data = call <24 x i32> @llvm.masked.load.v24i32.p0(ptr %in.vec, i32 64, <24 x i1> %tgt.mask, <24 x i32> poison)
+  %data.padded = shufflevector <24 x i32> %data, <24 x i32> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  store <24 x i32> %data, ptr %out.vec, align 64
   ret void
 }
 
-define void @mask_replication_factor6_vf8(ptr %in.vec, ptr %out.vec) nounwind {
+define void @mask_replication_factor6_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512F-ONLY-LABEL: mask_replication_factor6_vf8:
 ; AVX512F-ONLY:       # %bb.0:
-; AVX512F-ONLY-NEXT:    pushq %rbx
 ; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
-; AVX512F-ONLY-NEXT:    kshiftrw $7, %k1, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, %eax
-; AVX512F-ONLY-NEXT:    kshiftrw $6, %k1, %k0
-; AVX512F-ONLY-NEXT:    kshiftrw $5, %k1, %k2
-; AVX512F-ONLY-NEXT:    kmovw %k2, %ecx
-; AVX512F-ONLY-NEXT:    kshiftrw $4, %k1, %k2
-; AVX512F-ONLY-NEXT:    kmovw %k2, %edi
-; AVX512F-ONLY-NEXT:    kshiftrw $3, %k1, %k2
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r8d
-; AVX512F-ONLY-NEXT:    kshiftrw $2, %k1, %k2
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r9d
-; AVX512F-ONLY-NEXT:    kshiftrw $1, %k1, %k2
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r10d
-; AVX512F-ONLY-NEXT:    kmovw %k1, %edx
-; AVX512F-ONLY-NEXT:    movzbl %dl, %edx
-; AVX512F-ONLY-NEXT:    andl $1, %edx
-; AVX512F-ONLY-NEXT:    leaq (%rdx,%rdx,2), %r11
-; AVX512F-ONLY-NEXT:    leaq (%r11,%rdx,4), %r11
-; AVX512F-ONLY-NEXT:    leaq (%r11,%rdx,8), %r11
-; AVX512F-ONLY-NEXT:    movq %rdx, %rbx
-; AVX512F-ONLY-NEXT:    shlq $4, %rbx
-; AVX512F-ONLY-NEXT:    orq %r11, %rbx
-; AVX512F-ONLY-NEXT:    shlq $5, %rdx
-; AVX512F-ONLY-NEXT:    orq %rbx, %rdx
-; AVX512F-ONLY-NEXT:    movzbl %r10b, %r10d
-; AVX512F-ONLY-NEXT:    andl $1, %r10d
-; AVX512F-ONLY-NEXT:    movq %r10, %r11
-; AVX512F-ONLY-NEXT:    shlq $6, %r11
-; AVX512F-ONLY-NEXT:    movq %r10, %rbx
-; AVX512F-ONLY-NEXT:    shlq $7, %rbx
-; AVX512F-ONLY-NEXT:    orq %r11, %rbx
-; AVX512F-ONLY-NEXT:    movq %r10, %r11
-; AVX512F-ONLY-NEXT:    shlq $8, %r11
-; AVX512F-ONLY-NEXT:    orq %rbx, %r11
-; AVX512F-ONLY-NEXT:    movq %r10, %rbx
-; AVX512F-ONLY-NEXT:    shlq $9, %rbx
-; AVX512F-ONLY-NEXT:    orq %r11, %rbx
-; AVX512F-ONLY-NEXT:    movq %r10, %r11
-; AVX512F-ONLY-NEXT:    shlq $10, %r11
-; AVX512F-ONLY-NEXT:    orq %rbx, %r11
-; AVX512F-ONLY-NEXT:    shlq $11, %r10
-; AVX512F-ONLY-NEXT:    orq %r11, %r10
-; AVX512F-ONLY-NEXT:    movzbl %r9b, %r9d
-; AVX512F-ONLY-NEXT:    andl $1, %r9d
-; AVX512F-ONLY-NEXT:    movq %r9, %r11
-; AVX512F-ONLY-NEXT:    shlq $12, %r11
-; AVX512F-ONLY-NEXT:    orq %r10, %r11
-; AVX512F-ONLY-NEXT:    movq %r9, %r10
-; AVX512F-ONLY-NEXT:    shlq $13, %r10
-; AVX512F-ONLY-NEXT:    orq %r11, %r10
-; AVX512F-ONLY-NEXT:    movq %r9, %r11
-; AVX512F-ONLY-NEXT:    shlq $14, %r11
-; AVX512F-ONLY-NEXT:    orq %r10, %r11
-; AVX512F-ONLY-NEXT:    movq %r9, %r10
-; AVX512F-ONLY-NEXT:    shlq $15, %r10
-; AVX512F-ONLY-NEXT:    orq %r11, %r10
-; AVX512F-ONLY-NEXT:    movq %r9, %r11
-; AVX512F-ONLY-NEXT:    shlq $16, %r11
-; AVX512F-ONLY-NEXT:    orq %r10, %r11
-; AVX512F-ONLY-NEXT:    shlq $17, %r9
-; AVX512F-ONLY-NEXT:    orq %r11, %r9
-; AVX512F-ONLY-NEXT:    movzbl %r8b, %r8d
-; AVX512F-ONLY-NEXT:    andl $1, %r8d
-; AVX512F-ONLY-NEXT:    movq %r8, %r10
-; AVX512F-ONLY-NEXT:    shlq $18, %r10
-; AVX512F-ONLY-NEXT:    orq %r9, %r10
-; AVX512F-ONLY-NEXT:    movq %r8, %r9
-; AVX512F-ONLY-NEXT:    shlq $19, %r9
-; AVX512F-ONLY-NEXT:    orq %r10, %r9
-; AVX512F-ONLY-NEXT:    movq %r8, %r10
-; AVX512F-ONLY-NEXT:    shlq $20, %r10
-; AVX512F-ONLY-NEXT:    orq %r9, %r10
-; AVX512F-ONLY-NEXT:    movq %r8, %r9
-; AVX512F-ONLY-NEXT:    shlq $21, %r9
-; AVX512F-ONLY-NEXT:    orq %r10, %r9
-; AVX512F-ONLY-NEXT:    movq %r8, %r10
-; AVX512F-ONLY-NEXT:    shlq $22, %r10
-; AVX512F-ONLY-NEXT:    orq %r9, %r10
-; AVX512F-ONLY-NEXT:    shlq $23, %r8
-; AVX512F-ONLY-NEXT:    orq %r10, %r8
-; AVX512F-ONLY-NEXT:    movzbl %dil, %r9d
-; AVX512F-ONLY-NEXT:    andl $1, %r9d
-; AVX512F-ONLY-NEXT:    movq %r9, %rdi
-; AVX512F-ONLY-NEXT:    shlq $24, %rdi
-; AVX512F-ONLY-NEXT:    orq %r8, %rdi
-; AVX512F-ONLY-NEXT:    movq %r9, %r8
-; AVX512F-ONLY-NEXT:    shlq $25, %r8
-; AVX512F-ONLY-NEXT:    orq %rdi, %r8
-; AVX512F-ONLY-NEXT:    movq %r9, %rdi
-; AVX512F-ONLY-NEXT:    shlq $26, %rdi
-; AVX512F-ONLY-NEXT:    orq %r8, %rdi
-; AVX512F-ONLY-NEXT:    movq %r9, %r8
-; AVX512F-ONLY-NEXT:    shlq $27, %r8
-; AVX512F-ONLY-NEXT:    orq %rdi, %r8
-; AVX512F-ONLY-NEXT:    movq %r9, %rdi
-; AVX512F-ONLY-NEXT:    shlq $28, %rdi
-; AVX512F-ONLY-NEXT:    orq %r8, %rdi
-; AVX512F-ONLY-NEXT:    shlq $29, %r9
-; AVX512F-ONLY-NEXT:    orq %rdi, %r9
-; AVX512F-ONLY-NEXT:    movzbl %cl, %edi
-; AVX512F-ONLY-NEXT:    andl $1, %edi
-; AVX512F-ONLY-NEXT:    movq %rdi, %r8
-; AVX512F-ONLY-NEXT:    shlq $30, %r8
-; AVX512F-ONLY-NEXT:    orq %r9, %r8
-; AVX512F-ONLY-NEXT:    shlq $31, %rdi
-; AVX512F-ONLY-NEXT:    orq %r8, %rdi
-; AVX512F-ONLY-NEXT:    orq %rdx, %rdi
-; AVX512F-ONLY-NEXT:    movl %edi, (%rsi)
-; AVX512F-ONLY-NEXT:    andl $1, %ecx
-; AVX512F-ONLY-NEXT:    movq %rcx, %rdx
-; AVX512F-ONLY-NEXT:    shlq $32, %rdx
-; AVX512F-ONLY-NEXT:    movq %rcx, %r8
-; AVX512F-ONLY-NEXT:    shlq $33, %r8
-; AVX512F-ONLY-NEXT:    orq %rdx, %r8
-; AVX512F-ONLY-NEXT:    movq %rcx, %rdx
-; AVX512F-ONLY-NEXT:    shlq $34, %rdx
-; AVX512F-ONLY-NEXT:    orq %r8, %rdx
-; AVX512F-ONLY-NEXT:    shlq $35, %rcx
-; AVX512F-ONLY-NEXT:    orq %rdx, %rcx
-; AVX512F-ONLY-NEXT:    kmovw %k0, %edx
-; AVX512F-ONLY-NEXT:    andl $1, %edx
-; AVX512F-ONLY-NEXT:    movq %rdx, %r8
-; AVX512F-ONLY-NEXT:    shlq $36, %r8
-; AVX512F-ONLY-NEXT:    orq %rcx, %r8
-; AVX512F-ONLY-NEXT:    movq %rdx, %rcx
-; AVX512F-ONLY-NEXT:    shlq $37, %rcx
-; AVX512F-ONLY-NEXT:    orq %r8, %rcx
-; AVX512F-ONLY-NEXT:    movq %rdx, %r8
-; AVX512F-ONLY-NEXT:    shlq $38, %r8
-; AVX512F-ONLY-NEXT:    orq %rcx, %r8
-; AVX512F-ONLY-NEXT:    movq %rdx, %rcx
-; AVX512F-ONLY-NEXT:    shlq $39, %rcx
-; AVX512F-ONLY-NEXT:    orq %r8, %rcx
-; AVX512F-ONLY-NEXT:    movq %rdx, %r8
-; AVX512F-ONLY-NEXT:    shlq $40, %r8
-; AVX512F-ONLY-NEXT:    orq %rcx, %r8
-; AVX512F-ONLY-NEXT:    shlq $41, %rdx
-; AVX512F-ONLY-NEXT:    orq %r8, %rdx
-; AVX512F-ONLY-NEXT:    movzbl %al, %ecx
-; AVX512F-ONLY-NEXT:    # kill: def $eax killed $eax def $rax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %r8
-; AVX512F-ONLY-NEXT:    shlq $42, %r8
-; AVX512F-ONLY-NEXT:    orq %rdx, %r8
-; AVX512F-ONLY-NEXT:    movq %rax, %rdx
-; AVX512F-ONLY-NEXT:    shlq $43, %rdx
-; AVX512F-ONLY-NEXT:    orq %r8, %rdx
-; AVX512F-ONLY-NEXT:    movq %rax, %r8
-; AVX512F-ONLY-NEXT:    shlq $44, %r8
-; AVX512F-ONLY-NEXT:    orq %rdx, %r8
-; AVX512F-ONLY-NEXT:    movq %rax, %rdx
-; AVX512F-ONLY-NEXT:    shlq $45, %rdx
-; AVX512F-ONLY-NEXT:    orq %r8, %rdx
-; AVX512F-ONLY-NEXT:    shlq $46, %rax
-; AVX512F-ONLY-NEXT:    orq %rdx, %rax
-; AVX512F-ONLY-NEXT:    shlq $47, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    orq %rdi, %rcx
-; AVX512F-ONLY-NEXT:    shrq $32, %rcx
-; AVX512F-ONLY-NEXT:    movw %cx, 4(%rsi)
-; AVX512F-ONLY-NEXT:    popq %rbx
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; AVX512F-ONLY-NEXT:    movw $1, %ax
+; AVX512F-ONLY-NEXT:    kmovw %eax, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k2
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k3
+; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm1 {%k3} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm2 {%k2} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm2, 64(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm1, 128(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, (%rdx)
+; AVX512F-ONLY-NEXT:    vzeroupper
 ; AVX512F-ONLY-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: mask_replication_factor6_vf8:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    pushq %rbx
-; AVX512DQ-NEXT:    kmovb (%rdi), %k1
-; AVX512DQ-NEXT:    kshiftrb $7, %k1, %k0
-; AVX512DQ-NEXT:    kmovw %k0, %eax
-; AVX512DQ-NEXT:    kshiftrb $6, %k1, %k0
-; AVX512DQ-NEXT:    kshiftrb $5, %k1, %k2
-; AVX512DQ-NEXT:    kmovw %k2, %ecx
-; AVX512DQ-NEXT:    kshiftrb $4, %k1, %k2
-; AVX512DQ-NEXT:    kmovw %k2, %edi
-; AVX512DQ-NEXT:    kshiftrb $3, %k1, %k2
-; AVX512DQ-NEXT:    kmovw %k2, %r8d
-; AVX512DQ-NEXT:    kshiftrb $2, %k1, %k2
-; AVX512DQ-NEXT:    kmovw %k2, %r9d
-; AVX512DQ-NEXT:    kshiftrb $1, %k1, %k2
-; AVX512DQ-NEXT:    kmovw %k2, %r10d
-; AVX512DQ-NEXT:    kmovw %k1, %edx
-; AVX512DQ-NEXT:    movzbl %dl, %edx
-; AVX512DQ-NEXT:    andl $1, %edx
-; AVX512DQ-NEXT:    leaq (%rdx,%rdx,2), %r11
-; AVX512DQ-NEXT:    leaq (%r11,%rdx,4), %r11
-; AVX512DQ-NEXT:    leaq (%r11,%rdx,8), %r11
-; AVX512DQ-NEXT:    movq %rdx, %rbx
-; AVX512DQ-NEXT:    shlq $4, %rbx
-; AVX512DQ-NEXT:    orq %r11, %rbx
-; AVX512DQ-NEXT:    shlq $5, %rdx
-; AVX512DQ-NEXT:    orq %rbx, %rdx
-; AVX512DQ-NEXT:    movzbl %r10b, %r10d
-; AVX512DQ-NEXT:    andl $1, %r10d
-; AVX512DQ-NEXT:    movq %r10, %r11
-; AVX512DQ-NEXT:    shlq $6, %r11
-; AVX512DQ-NEXT:    movq %r10, %rbx
-; AVX512DQ-NEXT:    shlq $7, %rbx
-; AVX512DQ-NEXT:    orq %r11, %rbx
-; AVX512DQ-NEXT:    movq %r10, %r11
-; AVX512DQ-NEXT:    shlq $8, %r11
-; AVX512DQ-NEXT:    orq %rbx, %r11
-; AVX512DQ-NEXT:    movq %r10, %rbx
-; AVX512DQ-NEXT:    shlq $9, %rbx
-; AVX512DQ-NEXT:    orq %r11, %rbx
-; AVX512DQ-NEXT:    movq %r10, %r11
-; AVX512DQ-NEXT:    shlq $10, %r11
-; AVX512DQ-NEXT:    orq %rbx, %r11
-; AVX512DQ-NEXT:    shlq $11, %r10
-; AVX512DQ-NEXT:    orq %r11, %r10
-; AVX512DQ-NEXT:    movzbl %r9b, %r9d
-; AVX512DQ-NEXT:    andl $1, %r9d
-; AVX512DQ-NEXT:    movq %r9, %r11
-; AVX512DQ-NEXT:    shlq $12, %r11
-; AVX512DQ-NEXT:    orq %r10, %r11
-; AVX512DQ-NEXT:    movq %r9, %r10
-; AVX512DQ-NEXT:    shlq $13, %r10
-; AVX512DQ-NEXT:    orq %r11, %r10
-; AVX512DQ-NEXT:    movq %r9, %r11
-; AVX512DQ-NEXT:    shlq $14, %r11
-; AVX512DQ-NEXT:    orq %r10, %r11
-; AVX512DQ-NEXT:    movq %r9, %r10
-; AVX512DQ-NEXT:    shlq $15, %r10
-; AVX512DQ-NEXT:    orq %r11, %r10
-; AVX512DQ-NEXT:    movq %r9, %r11
-; AVX512DQ-NEXT:    shlq $16, %r11
-; AVX512DQ-NEXT:    orq %r10, %r11
-; AVX512DQ-NEXT:    shlq $17, %r9
-; AVX512DQ-NEXT:    orq %r11, %r9
-; AVX512DQ-NEXT:    movzbl %r8b, %r8d
-; AVX512DQ-NEXT:    andl $1, %r8d
-; AVX512DQ-NEXT:    movq %r8, %r10
-; AVX512DQ-NEXT:    shlq $18, %r10
-; AVX512DQ-NEXT:    orq %r9, %r10
-; AVX512DQ-NEXT:    movq %r8, %r9
-; AVX512DQ-NEXT:    shlq $19, %r9
-; AVX512DQ-NEXT:    orq %r10, %r9
-; AVX512DQ-NEXT:    movq %r8, %r10
-; AVX512DQ-NEXT:    shlq $20, %r10
-; AVX512DQ-NEXT:    orq %r9, %r10
-; AVX512DQ-NEXT:    movq %r8, %r9
-; AVX512DQ-NEXT:    shlq $21, %r9
-; AVX512DQ-NEXT:    orq %r10, %r9
-; AVX512DQ-NEXT:    movq %r8, %r10
-; AVX512DQ-NEXT:    shlq $22, %r10
-; AVX512DQ-NEXT:    orq %r9, %r10
-; AVX512DQ-NEXT:    shlq $23, %r8
-; AVX512DQ-NEXT:    orq %r10, %r8
-; AVX512DQ-NEXT:    movzbl %dil, %r9d
-; AVX512DQ-NEXT:    andl $1, %r9d
-; AVX512DQ-NEXT:    movq %r9, %rdi
-; AVX512DQ-NEXT:    shlq $24, %rdi
-; AVX512DQ-NEXT:    orq %r8, %rdi
-; AVX512DQ-NEXT:    movq %r9, %r8
-; AVX512DQ-NEXT:    shlq $25, %r8
-; AVX512DQ-NEXT:    orq %rdi, %r8
-; AVX512DQ-NEXT:    movq %r9, %rdi
-; AVX512DQ-NEXT:    shlq $26, %rdi
-; AVX512DQ-NEXT:    orq %r8, %rdi
-; AVX512DQ-NEXT:    movq %r9, %r8
-; AVX512DQ-NEXT:    shlq $27, %r8
-; AVX512DQ-NEXT:    orq %rdi, %r8
-; AVX512DQ-NEXT:    movq %r9, %rdi
-; AVX512DQ-NEXT:    shlq $28, %rdi
-; AVX512DQ-NEXT:    orq %r8, %rdi
-; AVX512DQ-NEXT:    shlq $29, %r9
-; AVX512DQ-NEXT:    orq %rdi, %r9
-; AVX512DQ-NEXT:    movzbl %cl, %edi
-; AVX512DQ-NEXT:    andl $1, %edi
-; AVX512DQ-NEXT:    movq %rdi, %r8
-; AVX512DQ-NEXT:    shlq $30, %r8
-; AVX512DQ-NEXT:    orq %r9, %r8
-; AVX512DQ-NEXT:    shlq $31, %rdi
-; AVX512DQ-NEXT:    orq %r8, %rdi
-; AVX512DQ-NEXT:    orq %rdx, %rdi
-; AVX512DQ-NEXT:    movl %edi, (%rsi)
-; AVX512DQ-NEXT:    andl $1, %ecx
-; AVX512DQ-NEXT:    movq %rcx, %rdx
-; AVX512DQ-NEXT:    shlq $32, %rdx
-; AVX512DQ-NEXT:    movq %rcx, %r8
-; AVX512DQ-NEXT:    shlq $33, %r8
-; AVX512DQ-NEXT:    orq %rdx, %r8
-; AVX512DQ-NEXT:    movq %rcx, %rdx
-; AVX512DQ-NEXT:    shlq $34, %rdx
-; AVX512DQ-NEXT:    orq %r8, %rdx
-; AVX512DQ-NEXT:    shlq $35, %rcx
-; AVX512DQ-NEXT:    orq %rdx, %rcx
-; AVX512DQ-NEXT:    kmovw %k0, %edx
-; AVX512DQ-NEXT:    andl $1, %edx
-; AVX512DQ-NEXT:    movq %rdx, %r8
-; AVX512DQ-NEXT:    shlq $36, %r8
-; AVX512DQ-NEXT:    orq %rcx, %r8
-; AVX512DQ-NEXT:    movq %rdx, %rcx
-; AVX512DQ-NEXT:    shlq $37, %rcx
-; AVX512DQ-NEXT:    orq %r8, %rcx
-; AVX512DQ-NEXT:    movq %rdx, %r8
-; AVX512DQ-NEXT:    shlq $38, %r8
-; AVX512DQ-NEXT:    orq %rcx, %r8
-; AVX512DQ-NEXT:    movq %rdx, %rcx
-; AVX512DQ-NEXT:    shlq $39, %rcx
-; AVX512DQ-NEXT:    orq %r8, %rcx
-; AVX512DQ-NEXT:    movq %rdx, %r8
-; AVX512DQ-NEXT:    shlq $40, %r8
-; AVX512DQ-NEXT:    orq %rcx, %r8
-; AVX512DQ-NEXT:    shlq $41, %rdx
-; AVX512DQ-NEXT:    orq %r8, %rdx
-; AVX512DQ-NEXT:    movzbl %al, %ecx
-; AVX512DQ-NEXT:    # kill: def $eax killed $eax def $rax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %r8
-; AVX512DQ-NEXT:    shlq $42, %r8
-; AVX512DQ-NEXT:    orq %rdx, %r8
-; AVX512DQ-NEXT:    movq %rax, %rdx
-; AVX512DQ-NEXT:    shlq $43, %rdx
-; AVX512DQ-NEXT:    orq %r8, %rdx
-; AVX512DQ-NEXT:    movq %rax, %r8
-; AVX512DQ-NEXT:    shlq $44, %r8
-; AVX512DQ-NEXT:    orq %rdx, %r8
-; AVX512DQ-NEXT:    movq %rax, %rdx
-; AVX512DQ-NEXT:    shlq $45, %rdx
-; AVX512DQ-NEXT:    orq %r8, %rdx
-; AVX512DQ-NEXT:    shlq $46, %rax
-; AVX512DQ-NEXT:    orq %rdx, %rax
-; AVX512DQ-NEXT:    shlq $47, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    orq %rdi, %rcx
-; AVX512DQ-NEXT:    shrq $32, %rcx
-; AVX512DQ-NEXT:    movw %cx, 4(%rsi)
-; AVX512DQ-NEXT:    popq %rbx
+; AVX512DQ-NEXT:    kmovb (%rdi), %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm1
+; AVX512DQ-NEXT:    movw $1, %ax
+; AVX512DQ-NEXT:    kmovw %eax, %k1
+; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k1
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k2
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k3
+; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
+; AVX512DQ-NEXT:    vmovdqa32 128(%rsi), %zmm1 {%k3} {z}
+; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm2 {%k2} {z}
+; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 64(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 128(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm0, (%rdx)
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
-; AVX512BW-LABEL: mask_replication_factor6_vf8:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    pushq %rbx
-; AVX512BW-NEXT:    kmovw (%rdi), %k1
-; AVX512BW-NEXT:    kshiftrw $7, %k1, %k0
-; AVX512BW-NEXT:    kmovd %k0, %eax
-; AVX512BW-NEXT:    kshiftrw $6, %k1, %k0
-; AVX512BW-NEXT:    kshiftrw $5, %k1, %k2
-; AVX512BW-NEXT:    kmovd %k2, %ecx
-; AVX512BW-NEXT:    kshiftrw $4, %k1, %k2
-; AVX512BW-NEXT:    kmovd %k2, %edi
-; AVX512BW-NEXT:    kshiftrw $3, %k1, %k2
-; AVX512BW-NEXT:    kmovd %k2, %r8d
-; AVX512BW-NEXT:    kshiftrw $2, %k1, %k2
-; AVX512BW-NEXT:    kmovd %k2, %r9d
-; AVX512BW-NEXT:    kshiftrw $1, %k1, %k2
-; AVX512BW-NEXT:    kmovd %k2, %r10d
-; AVX512BW-NEXT:    kmovd %k1, %edx
-; AVX512BW-NEXT:    movzbl %dl, %edx
-; AVX512BW-NEXT:    andl $1, %edx
-; AVX512BW-NEXT:    leaq (%rdx,%rdx,2), %r11
-; AVX512BW-NEXT:    leaq (%r11,%rdx,4), %r11
-; AVX512BW-NEXT:    leaq (%r11,%rdx,8), %r11
-; AVX512BW-NEXT:    movq %rdx, %rbx
-; AVX512BW-NEXT:    shlq $4, %rbx
-; AVX512BW-NEXT:    orq %r11, %rbx
-; AVX512BW-NEXT:    shlq $5, %rdx
-; AVX512BW-NEXT:    orq %rbx, %rdx
-; AVX512BW-NEXT:    movzbl %r10b, %r10d
-; AVX512BW-NEXT:    andl $1, %r10d
-; AVX512BW-NEXT:    movq %r10, %r11
-; AVX512BW-NEXT:    shlq $6, %r11
-; AVX512BW-NEXT:    movq %r10, %rbx
-; AVX512BW-NEXT:    shlq $7, %rbx
-; AVX512BW-NEXT:    orq %r11, %rbx
-; AVX512BW-NEXT:    movq %r10, %r11
-; AVX512BW-NEXT:    shlq $8, %r11
-; AVX512BW-NEXT:    orq %rbx, %r11
-; AVX512BW-NEXT:    movq %r10, %rbx
-; AVX512BW-NEXT:    shlq $9, %rbx
-; AVX512BW-NEXT:    orq %r11, %rbx
-; AVX512BW-NEXT:    movq %r10, %r11
-; AVX512BW-NEXT:    shlq $10, %r11
-; AVX512BW-NEXT:    orq %rbx, %r11
-; AVX512BW-NEXT:    shlq $11, %r10
-; AVX512BW-NEXT:    orq %r11, %r10
-; AVX512BW-NEXT:    movzbl %r9b, %r9d
-; AVX512BW-NEXT:    andl $1, %r9d
-; AVX512BW-NEXT:    movq %r9, %r11
-; AVX512BW-NEXT:    shlq $12, %r11
-; AVX512BW-NEXT:    orq %r10, %r11
-; AVX512BW-NEXT:    movq %r9, %r10
-; AVX512BW-NEXT:    shlq $13, %r10
-; AVX512BW-NEXT:    orq %r11, %r10
-; AVX512BW-NEXT:    movq %r9, %r11
-; AVX512BW-NEXT:    shlq $14, %r11
-; AVX512BW-NEXT:    orq %r10, %r11
-; AVX512BW-NEXT:    movq %r9, %r10
-; AVX512BW-NEXT:    shlq $15, %r10
-; AVX512BW-NEXT:    orq %r11, %r10
-; AVX512BW-NEXT:    movq %r9, %r11
-; AVX512BW-NEXT:    shlq $16, %r11
-; AVX512BW-NEXT:    orq %r10, %r11
-; AVX512BW-NEXT:    shlq $17, %r9
-; AVX512BW-NEXT:    orq %r11, %r9
-; AVX512BW-NEXT:    movzbl %r8b, %r8d
-; AVX512BW-NEXT:    andl $1, %r8d
-; AVX512BW-NEXT:    movq %r8, %r10
-; AVX512BW-NEXT:    shlq $18, %r10
-; AVX512BW-NEXT:    orq %r9, %r10
-; AVX512BW-NEXT:    movq %r8, %r9
-; AVX512BW-NEXT:    shlq $19, %r9
-; AVX512BW-NEXT:    orq %r10, %r9
-; AVX512BW-NEXT:    movq %r8, %r10
-; AVX512BW-NEXT:    shlq $20, %r10
-; AVX512BW-NEXT:    orq %r9, %r10
-; AVX512BW-NEXT:    movq %r8, %r9
-; AVX512BW-NEXT:    shlq $21, %r9
-; AVX512BW-NEXT:    orq %r10, %r9
-; AVX512BW-NEXT:    movq %r8, %r10
-; AVX512BW-NEXT:    shlq $22, %r10
-; AVX512BW-NEXT:    orq %r9, %r10
-; AVX512BW-NEXT:    shlq $23, %r8
-; AVX512BW-NEXT:    orq %r10, %r8
-; AVX512BW-NEXT:    movzbl %dil, %r9d
-; AVX512BW-NEXT:    andl $1, %r9d
-; AVX512BW-NEXT:    movq %r9, %rdi
-; AVX512BW-NEXT:    shlq $24, %rdi
-; AVX512BW-NEXT:    orq %r8, %rdi
-; AVX512BW-NEXT:    movq %r9, %r8
-; AVX512BW-NEXT:    shlq $25, %r8
-; AVX512BW-NEXT:    orq %rdi, %r8
-; AVX512BW-NEXT:    movq %r9, %rdi
-; AVX512BW-NEXT:    shlq $26, %rdi
-; AVX512BW-NEXT:    orq %r8, %rdi
-; AVX512BW-NEXT:    movq %r9, %r8
-; AVX512BW-NEXT:    shlq $27, %r8
-; AVX512BW-NEXT:    orq %rdi, %r8
-; AVX512BW-NEXT:    movq %r9, %rdi
-; AVX512BW-NEXT:    shlq $28, %rdi
-; AVX512BW-NEXT:    orq %r8, %rdi
-; AVX512BW-NEXT:    shlq $29, %r9
-; AVX512BW-NEXT:    orq %rdi, %r9
-; AVX512BW-NEXT:    movzbl %cl, %edi
-; AVX512BW-NEXT:    andl $1, %edi
-; AVX512BW-NEXT:    movq %rdi, %r8
-; AVX512BW-NEXT:    shlq $30, %r8
-; AVX512BW-NEXT:    orq %r9, %r8
-; AVX512BW-NEXT:    shlq $31, %rdi
-; AVX512BW-NEXT:    orq %r8, %rdi
-; AVX512BW-NEXT:    orq %rdx, %rdi
-; AVX512BW-NEXT:    movl %edi, (%rsi)
-; AVX512BW-NEXT:    andl $1, %ecx
-; AVX512BW-NEXT:    movq %rcx, %rdx
-; AVX512BW-NEXT:    shlq $32, %rdx
-; AVX512BW-NEXT:    movq %rcx, %r8
-; AVX512BW-NEXT:    shlq $33, %r8
-; AVX512BW-NEXT:    orq %rdx, %r8
-; AVX512BW-NEXT:    movq %rcx, %rdx
-; AVX512BW-NEXT:    shlq $34, %rdx
-; AVX512BW-NEXT:    orq %r8, %rdx
-; AVX512BW-NEXT:    shlq $35, %rcx
-; AVX512BW-NEXT:    orq %rdx, %rcx
-; AVX512BW-NEXT:    kmovd %k0, %edx
-; AVX512BW-NEXT:    andl $1, %edx
-; AVX512BW-NEXT:    movq %rdx, %r8
-; AVX512BW-NEXT:    shlq $36, %r8
-; AVX512BW-NEXT:    orq %rcx, %r8
-; AVX512BW-NEXT:    movq %rdx, %rcx
-; AVX512BW-NEXT:    shlq $37, %rcx
-; AVX512BW-NEXT:    orq %r8, %rcx
-; AVX512BW-NEXT:    movq %rdx, %r8
-; AVX512BW-NEXT:    shlq $38, %r8
-; AVX512BW-NEXT:    orq %rcx, %r8
-; AVX512BW-NEXT:    movq %rdx, %rcx
-; AVX512BW-NEXT:    shlq $39, %rcx
-; AVX512BW-NEXT:    orq %r8, %rcx
-; AVX512BW-NEXT:    movq %rdx, %r8
-; AVX512BW-NEXT:    shlq $40, %r8
-; AVX512BW-NEXT:    orq %rcx, %r8
-; AVX512BW-NEXT:    shlq $41, %rdx
-; AVX512BW-NEXT:    orq %r8, %rdx
-; AVX512BW-NEXT:    movzbl %al, %ecx
-; AVX512BW-NEXT:    # kill: def $eax killed $eax def $rax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %r8
-; AVX512BW-NEXT:    shlq $42, %r8
-; AVX512BW-NEXT:    orq %rdx, %r8
-; AVX512BW-NEXT:    movq %rax, %rdx
-; AVX512BW-NEXT:    shlq $43, %rdx
-; AVX512BW-NEXT:    orq %r8, %rdx
-; AVX512BW-NEXT:    movq %rax, %r8
-; AVX512BW-NEXT:    shlq $44, %r8
-; AVX512BW-NEXT:    orq %rdx, %r8
-; AVX512BW-NEXT:    movq %rax, %rdx
-; AVX512BW-NEXT:    shlq $45, %rdx
-; AVX512BW-NEXT:    orq %r8, %rdx
-; AVX512BW-NEXT:    shlq $46, %rax
-; AVX512BW-NEXT:    orq %rdx, %rax
-; AVX512BW-NEXT:    shlq $47, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    orq %rdi, %rcx
-; AVX512BW-NEXT:    shrq $32, %rcx
-; AVX512BW-NEXT:    movw %cx, 4(%rsi)
-; AVX512BW-NEXT:    popq %rbx
-; AVX512BW-NEXT:    retq
-  %src.vec = load <8 x i1>, ptr %in.vec, align 64
-  %tgt.mask = shufflevector <8 x i1> %src.vec, <8 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-  store <48 x i1> %tgt.mask, ptr %out.vec, align 64
+; AVX512BW-ONLY-LABEL: mask_replication_factor6_vf8:
+; AVX512BW-ONLY:       # %bb.0:
+; AVX512BW-ONLY-NEXT:    kmovw (%rdi), %k1
+; AVX512BW-ONLY-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512BW-ONLY-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
+; AVX512BW-ONLY-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2,18,18,19,19,19,19,19,19,20,20,20,20,20,20,21,21,37,37,37,37,38,38,38,38,38,38,39,39,39,39,39,39,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-ONLY-NEXT:    vpmovb2m %zmm0, %k2
+; AVX512BW-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k2} {z}
+; AVX512BW-ONLY-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; AVX512BW-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7]
+; AVX512BW-ONLY-NEXT:    vpermd %zmm1, %zmm2, %zmm1
+; AVX512BW-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; AVX512BW-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm1 {%k1} {z}
+; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k2, %k1
+; AVX512BW-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm2 {%k1} {z}
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm2, 64(%rdx)
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm1, 128(%rdx)
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm0, (%rdx)
+; AVX512BW-ONLY-NEXT:    vzeroupper
+; AVX512BW-ONLY-NEXT:    retq
+;
+; AVX512VBMI-ONLY-LABEL: mask_replication_factor6_vf8:
+; AVX512VBMI-ONLY:       # %bb.0:
+; AVX512VBMI-ONLY-NEXT:    kmovw (%rdi), %k1
+; AVX512VBMI-ONLY-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2,2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5,5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VBMI-ONLY-NEXT:    vpermb %zmm0, %zmm1, %zmm0
+; AVX512VBMI-ONLY-NEXT:    vpmovb2m %zmm0, %k2
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k2} {z}
+; AVX512VBMI-ONLY-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7]
+; AVX512VBMI-ONLY-NEXT:    vpermd %zmm1, %zmm2, %zmm1
+; AVX512VBMI-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm1 {%k1} {z}
+; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k2, %k1
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm2 {%k1} {z}
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm2, 64(%rdx)
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm1, 128(%rdx)
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm0, (%rdx)
+; AVX512VBMI-ONLY-NEXT:    vzeroupper
+; AVX512VBMI-ONLY-NEXT:    retq
+  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
+  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %tgt.mask = shufflevector <8 x i1> %src.mask, <8 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+  %data = call <48 x i32> @llvm.masked.load.v48i32.p0(ptr %in.vec, i32 64, <48 x i1> %tgt.mask, <48 x i32> poison)
+  store <48 x i32> %data, ptr %out.vec, align 64
   ret void
 }
 
-define void @mask_replication_factor6_vf16(ptr %in.vec, ptr %out.vec) nounwind {
+define void @mask_replication_factor6_vf16(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512F-ONLY-LABEL: mask_replication_factor6_vf16:
 ; AVX512F-ONLY:       # %bb.0:
-; AVX512F-ONLY-NEXT:    pushq %rbp
-; AVX512F-ONLY-NEXT:    pushq %r15
-; AVX512F-ONLY-NEXT:    pushq %r14
-; AVX512F-ONLY-NEXT:    pushq %r13
-; AVX512F-ONLY-NEXT:    pushq %r12
-; AVX512F-ONLY-NEXT:    pushq %rbx
-; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k0
-; AVX512F-ONLY-NEXT:    kshiftrw $10, %k0, %k1
-; AVX512F-ONLY-NEXT:    kmovw %k1, %edi
-; AVX512F-ONLY-NEXT:    kshiftrw $5, %k0, %k1
-; AVX512F-ONLY-NEXT:    kmovw %k1, %ecx
-; AVX512F-ONLY-NEXT:    kshiftrw $4, %k0, %k1
-; AVX512F-ONLY-NEXT:    kmovw %k1, %eax
-; AVX512F-ONLY-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512F-ONLY-NEXT:    kshiftrw $3, %k0, %k1
-; AVX512F-ONLY-NEXT:    kmovw %k1, %edx
-; AVX512F-ONLY-NEXT:    kshiftrw $2, %k0, %k1
-; AVX512F-ONLY-NEXT:    kmovw %k1, %r8d
-; AVX512F-ONLY-NEXT:    kshiftrw $1, %k0, %k1
-; AVX512F-ONLY-NEXT:    kmovw %k1, %r9d
-; AVX512F-ONLY-NEXT:    kshiftrw $11, %k0, %k1
-; AVX512F-ONLY-NEXT:    kmovw %k1, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $12, %k0, %k1
-; AVX512F-ONLY-NEXT:    kmovw %k1, %r14d
-; AVX512F-ONLY-NEXT:    kshiftrw $13, %k0, %k1
-; AVX512F-ONLY-NEXT:    kmovw %k1, %ebp
-; AVX512F-ONLY-NEXT:    kshiftrw $14, %k0, %k1
-; AVX512F-ONLY-NEXT:    kmovw %k1, %r15d
-; AVX512F-ONLY-NEXT:    kshiftrw $15, %k0, %k1
-; AVX512F-ONLY-NEXT:    kmovw %k1, %r11d
-; AVX512F-ONLY-NEXT:    kshiftrw $6, %k0, %k1
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movl %edi, %r10d
-; AVX512F-ONLY-NEXT:    andl $1, %r10d
-; AVX512F-ONLY-NEXT:    leal (%r10,%r10,2), %ebx
-; AVX512F-ONLY-NEXT:    leal (%rbx,%r12,4), %ebx
-; AVX512F-ONLY-NEXT:    leal (%rbx,%r12,8), %ebx
-; AVX512F-ONLY-NEXT:    movl %r12d, %r13d
-; AVX512F-ONLY-NEXT:    shll $4, %r13d
-; AVX512F-ONLY-NEXT:    orl %ebx, %r13d
-; AVX512F-ONLY-NEXT:    movl %r12d, %ebx
-; AVX512F-ONLY-NEXT:    shll $5, %ebx
-; AVX512F-ONLY-NEXT:    orl %r13d, %ebx
-; AVX512F-ONLY-NEXT:    movl %r12d, %r13d
-; AVX512F-ONLY-NEXT:    shll $6, %r13d
-; AVX512F-ONLY-NEXT:    shll $7, %r12d
-; AVX512F-ONLY-NEXT:    orl %r13d, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r14d
-; AVX512F-ONLY-NEXT:    movl %r14d, %r13d
-; AVX512F-ONLY-NEXT:    shll $8, %r13d
-; AVX512F-ONLY-NEXT:    orl %r12d, %r13d
-; AVX512F-ONLY-NEXT:    movl %r14d, %r12d
-; AVX512F-ONLY-NEXT:    shll $9, %r12d
-; AVX512F-ONLY-NEXT:    orl %r13d, %r12d
-; AVX512F-ONLY-NEXT:    movl %r14d, %r13d
-; AVX512F-ONLY-NEXT:    shll $10, %r13d
-; AVX512F-ONLY-NEXT:    orl %r12d, %r13d
-; AVX512F-ONLY-NEXT:    movl %r14d, %r12d
-; AVX512F-ONLY-NEXT:    shll $11, %r12d
-; AVX512F-ONLY-NEXT:    orl %r13d, %r12d
-; AVX512F-ONLY-NEXT:    movl %r14d, %r13d
-; AVX512F-ONLY-NEXT:    shll $12, %r13d
-; AVX512F-ONLY-NEXT:    orl %r12d, %r13d
-; AVX512F-ONLY-NEXT:    kmovw %k1, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $7, %k0, %k1
-; AVX512F-ONLY-NEXT:    shll $13, %r14d
-; AVX512F-ONLY-NEXT:    orl %r13d, %r14d
-; AVX512F-ONLY-NEXT:    andl $1, %ebp
-; AVX512F-ONLY-NEXT:    movl %ebp, %r13d
-; AVX512F-ONLY-NEXT:    shll $14, %r13d
-; AVX512F-ONLY-NEXT:    orl %r14d, %r13d
-; AVX512F-ONLY-NEXT:    movl %ebp, %r14d
-; AVX512F-ONLY-NEXT:    shll $15, %r14d
-; AVX512F-ONLY-NEXT:    orl %r13d, %r14d
-; AVX512F-ONLY-NEXT:    movl %ebp, %r13d
-; AVX512F-ONLY-NEXT:    shll $16, %r13d
-; AVX512F-ONLY-NEXT:    orl %r14d, %r13d
-; AVX512F-ONLY-NEXT:    movl %ebp, %r14d
-; AVX512F-ONLY-NEXT:    shll $17, %r14d
-; AVX512F-ONLY-NEXT:    orl %r13d, %r14d
-; AVX512F-ONLY-NEXT:    movl %ebp, %r13d
-; AVX512F-ONLY-NEXT:    shll $18, %r13d
-; AVX512F-ONLY-NEXT:    orl %r14d, %r13d
-; AVX512F-ONLY-NEXT:    kmovw %k1, %r14d
-; AVX512F-ONLY-NEXT:    kshiftrw $8, %k0, %k1
-; AVX512F-ONLY-NEXT:    shll $19, %ebp
-; AVX512F-ONLY-NEXT:    orl %r13d, %ebp
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    movl %r15d, %r13d
-; AVX512F-ONLY-NEXT:    shll $20, %r13d
-; AVX512F-ONLY-NEXT:    orl %ebp, %r13d
-; AVX512F-ONLY-NEXT:    movl %r15d, %ebp
-; AVX512F-ONLY-NEXT:    shll $21, %ebp
-; AVX512F-ONLY-NEXT:    orl %r13d, %ebp
-; AVX512F-ONLY-NEXT:    movl %r15d, %r13d
-; AVX512F-ONLY-NEXT:    shll $22, %r13d
-; AVX512F-ONLY-NEXT:    orl %ebp, %r13d
-; AVX512F-ONLY-NEXT:    movl %r15d, %ebp
-; AVX512F-ONLY-NEXT:    shll $23, %ebp
-; AVX512F-ONLY-NEXT:    orl %r13d, %ebp
-; AVX512F-ONLY-NEXT:    movl %r15d, %r13d
-; AVX512F-ONLY-NEXT:    shll $24, %r13d
-; AVX512F-ONLY-NEXT:    orl %ebp, %r13d
-; AVX512F-ONLY-NEXT:    shll $25, %r15d
-; AVX512F-ONLY-NEXT:    orl %r13d, %r15d
-; AVX512F-ONLY-NEXT:    movl %r11d, %r13d
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movl %r13d, %ebp
-; AVX512F-ONLY-NEXT:    shll $26, %ebp
-; AVX512F-ONLY-NEXT:    orl %r15d, %ebp
-; AVX512F-ONLY-NEXT:    movl %r13d, %r15d
-; AVX512F-ONLY-NEXT:    shll $27, %r15d
-; AVX512F-ONLY-NEXT:    orl %ebp, %r15d
-; AVX512F-ONLY-NEXT:    movl %r13d, %ebp
-; AVX512F-ONLY-NEXT:    shll $28, %ebp
-; AVX512F-ONLY-NEXT:    orl %r15d, %ebp
-; AVX512F-ONLY-NEXT:    movl %r13d, %eax
-; AVX512F-ONLY-NEXT:    shll $29, %eax
-; AVX512F-ONLY-NEXT:    orl %ebp, %eax
-; AVX512F-ONLY-NEXT:    kmovw %k1, %r15d
-; AVX512F-ONLY-NEXT:    shll $30, %r13d
-; AVX512F-ONLY-NEXT:    orl %eax, %r13d
-; AVX512F-ONLY-NEXT:    kmovw %k0, %ebp
-; AVX512F-ONLY-NEXT:    kshiftrw $9, %k0, %k0
-; AVX512F-ONLY-NEXT:    shll $31, %r11d
-; AVX512F-ONLY-NEXT:    orl %r13d, %r11d
-; AVX512F-ONLY-NEXT:    kmovw %k0, %r13d
-; AVX512F-ONLY-NEXT:    movzbl %dil, %edi
-; AVX512F-ONLY-NEXT:    orl %ebx, %r11d
-; AVX512F-ONLY-NEXT:    movl %r11d, 8(%rsi)
-; AVX512F-ONLY-NEXT:    movzbl %bpl, %r11d
-; AVX512F-ONLY-NEXT:    andl $1, %r11d
-; AVX512F-ONLY-NEXT:    leaq (%r11,%r11,2), %rax
-; AVX512F-ONLY-NEXT:    leaq (%rax,%r11,4), %rax
-; AVX512F-ONLY-NEXT:    leaq (%rax,%r11,8), %rax
-; AVX512F-ONLY-NEXT:    movq %r11, %rbx
-; AVX512F-ONLY-NEXT:    shlq $4, %rbx
-; AVX512F-ONLY-NEXT:    orq %rax, %rbx
-; AVX512F-ONLY-NEXT:    shlq $5, %r11
-; AVX512F-ONLY-NEXT:    orq %rbx, %r11
-; AVX512F-ONLY-NEXT:    movzbl %r9b, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %r9
-; AVX512F-ONLY-NEXT:    shlq $6, %r9
-; AVX512F-ONLY-NEXT:    movq %rax, %rbx
-; AVX512F-ONLY-NEXT:    shlq $7, %rbx
-; AVX512F-ONLY-NEXT:    orq %r9, %rbx
-; AVX512F-ONLY-NEXT:    movq %rax, %r9
-; AVX512F-ONLY-NEXT:    shlq $8, %r9
-; AVX512F-ONLY-NEXT:    orq %rbx, %r9
-; AVX512F-ONLY-NEXT:    movq %rax, %rbx
-; AVX512F-ONLY-NEXT:    shlq $9, %rbx
-; AVX512F-ONLY-NEXT:    orq %r9, %rbx
-; AVX512F-ONLY-NEXT:    movq %rax, %r9
-; AVX512F-ONLY-NEXT:    shlq $10, %r9
-; AVX512F-ONLY-NEXT:    orq %rbx, %r9
-; AVX512F-ONLY-NEXT:    shlq $11, %rax
-; AVX512F-ONLY-NEXT:    orq %r9, %rax
-; AVX512F-ONLY-NEXT:    movzbl %r8b, %r8d
-; AVX512F-ONLY-NEXT:    andl $1, %r8d
-; AVX512F-ONLY-NEXT:    movq %r8, %r9
-; AVX512F-ONLY-NEXT:    shlq $12, %r9
-; AVX512F-ONLY-NEXT:    orq %rax, %r9
-; AVX512F-ONLY-NEXT:    movq %r8, %rax
-; AVX512F-ONLY-NEXT:    shlq $13, %rax
-; AVX512F-ONLY-NEXT:    orq %r9, %rax
-; AVX512F-ONLY-NEXT:    movq %r8, %r9
-; AVX512F-ONLY-NEXT:    shlq $14, %r9
-; AVX512F-ONLY-NEXT:    orq %rax, %r9
-; AVX512F-ONLY-NEXT:    movq %r8, %rax
-; AVX512F-ONLY-NEXT:    shlq $15, %rax
-; AVX512F-ONLY-NEXT:    orq %r9, %rax
-; AVX512F-ONLY-NEXT:    movq %r8, %r9
-; AVX512F-ONLY-NEXT:    shlq $16, %r9
-; AVX512F-ONLY-NEXT:    orq %rax, %r9
-; AVX512F-ONLY-NEXT:    shlq $17, %r8
-; AVX512F-ONLY-NEXT:    orq %r9, %r8
-; AVX512F-ONLY-NEXT:    movzbl %dl, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %rdx
-; AVX512F-ONLY-NEXT:    shlq $18, %rdx
-; AVX512F-ONLY-NEXT:    orq %r8, %rdx
-; AVX512F-ONLY-NEXT:    movq %rax, %r8
-; AVX512F-ONLY-NEXT:    shlq $19, %r8
-; AVX512F-ONLY-NEXT:    orq %rdx, %r8
-; AVX512F-ONLY-NEXT:    movq %rax, %rdx
-; AVX512F-ONLY-NEXT:    shlq $20, %rdx
-; AVX512F-ONLY-NEXT:    orq %r8, %rdx
-; AVX512F-ONLY-NEXT:    movq %rax, %r8
-; AVX512F-ONLY-NEXT:    shlq $21, %r8
-; AVX512F-ONLY-NEXT:    orq %rdx, %r8
-; AVX512F-ONLY-NEXT:    movq %rax, %rdx
-; AVX512F-ONLY-NEXT:    shlq $22, %rdx
-; AVX512F-ONLY-NEXT:    orq %r8, %rdx
-; AVX512F-ONLY-NEXT:    shlq $23, %rax
-; AVX512F-ONLY-NEXT:    orq %rdx, %rax
-; AVX512F-ONLY-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
-; AVX512F-ONLY-NEXT:    andl $1, %edx
-; AVX512F-ONLY-NEXT:    movq %rdx, %r8
-; AVX512F-ONLY-NEXT:    shlq $24, %r8
-; AVX512F-ONLY-NEXT:    orq %rax, %r8
-; AVX512F-ONLY-NEXT:    movq %rdx, %rax
-; AVX512F-ONLY-NEXT:    shlq $25, %rax
-; AVX512F-ONLY-NEXT:    orq %r8, %rax
-; AVX512F-ONLY-NEXT:    movq %rdx, %r8
-; AVX512F-ONLY-NEXT:    shlq $26, %r8
-; AVX512F-ONLY-NEXT:    orq %rax, %r8
-; AVX512F-ONLY-NEXT:    movq %rdx, %rax
-; AVX512F-ONLY-NEXT:    shlq $27, %rax
-; AVX512F-ONLY-NEXT:    orq %r8, %rax
-; AVX512F-ONLY-NEXT:    movq %rdx, %r8
-; AVX512F-ONLY-NEXT:    shlq $28, %r8
-; AVX512F-ONLY-NEXT:    orq %rax, %r8
-; AVX512F-ONLY-NEXT:    shlq $29, %rdx
-; AVX512F-ONLY-NEXT:    orq %r8, %rdx
-; AVX512F-ONLY-NEXT:    movzbl %cl, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %r8
-; AVX512F-ONLY-NEXT:    shlq $30, %r8
-; AVX512F-ONLY-NEXT:    orq %rdx, %r8
-; AVX512F-ONLY-NEXT:    shlq $31, %rax
-; AVX512F-ONLY-NEXT:    orq %r8, %rax
-; AVX512F-ONLY-NEXT:    andl $1, %ecx
-; AVX512F-ONLY-NEXT:    movq %rcx, %rdx
-; AVX512F-ONLY-NEXT:    shlq $32, %rdx
-; AVX512F-ONLY-NEXT:    orq %rax, %rdx
-; AVX512F-ONLY-NEXT:    movq %rcx, %rax
-; AVX512F-ONLY-NEXT:    shlq $33, %rax
-; AVX512F-ONLY-NEXT:    orq %rdx, %rax
-; AVX512F-ONLY-NEXT:    movq %rcx, %rdx
-; AVX512F-ONLY-NEXT:    shlq $34, %rdx
-; AVX512F-ONLY-NEXT:    orq %rax, %rdx
-; AVX512F-ONLY-NEXT:    shlq $35, %rcx
-; AVX512F-ONLY-NEXT:    orq %rdx, %rcx
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $36, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %rcx
-; AVX512F-ONLY-NEXT:    shlq $37, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $38, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %rcx
-; AVX512F-ONLY-NEXT:    shlq $39, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $40, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    shlq $41, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    andl $1, %r14d
-; AVX512F-ONLY-NEXT:    movq %r14, %rax
-; AVX512F-ONLY-NEXT:    shlq $42, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r14, %rcx
-; AVX512F-ONLY-NEXT:    shlq $43, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    movq %r14, %rax
-; AVX512F-ONLY-NEXT:    shlq $44, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    movq %r14, %rcx
-; AVX512F-ONLY-NEXT:    shlq $45, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    movq %r14, %rax
-; AVX512F-ONLY-NEXT:    shlq $46, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    shlq $47, %r14
-; AVX512F-ONLY-NEXT:    orq %rax, %r14
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $48, %rax
-; AVX512F-ONLY-NEXT:    orq %r14, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %rcx
-; AVX512F-ONLY-NEXT:    shlq $49, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $50, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %rcx
-; AVX512F-ONLY-NEXT:    shlq $51, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $52, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    shlq $53, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $54, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %rcx
-; AVX512F-ONLY-NEXT:    shlq $55, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $56, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %rcx
-; AVX512F-ONLY-NEXT:    shlq $57, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $58, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    shlq $59, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r10, %rax
-; AVX512F-ONLY-NEXT:    shlq $60, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r10, %rcx
-; AVX512F-ONLY-NEXT:    shlq $61, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    shlq $62, %r10
-; AVX512F-ONLY-NEXT:    orq %rcx, %r10
-; AVX512F-ONLY-NEXT:    shlq $63, %rdi
-; AVX512F-ONLY-NEXT:    orq %r10, %rdi
-; AVX512F-ONLY-NEXT:    orq %r11, %rdi
-; AVX512F-ONLY-NEXT:    movq %rdi, (%rsi)
-; AVX512F-ONLY-NEXT:    popq %rbx
-; AVX512F-ONLY-NEXT:    popq %r12
-; AVX512F-ONLY-NEXT:    popq %r13
-; AVX512F-ONLY-NEXT:    popq %r14
-; AVX512F-ONLY-NEXT:    popq %r15
-; AVX512F-ONLY-NEXT:    popq %rbp
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; AVX512F-ONLY-NEXT:    movw $1, %ax
+; AVX512F-ONLY-NEXT:    kmovw %eax, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k2
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k3
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k4
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k5
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k6
+; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k2} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 320(%rsi), %zmm1 {%k6} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 256(%rsi), %zmm2 {%k5} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm3 {%k4} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm4 {%k3} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm5 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm5, 64(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm4, 128(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm3, 192(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm2, 256(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm1, 320(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, (%rdx)
+; AVX512F-ONLY-NEXT:    vzeroupper
 ; AVX512F-ONLY-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: mask_replication_factor6_vf16:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    pushq %rbp
-; AVX512DQ-NEXT:    pushq %r15
-; AVX512DQ-NEXT:    pushq %r14
-; AVX512DQ-NEXT:    pushq %r13
-; AVX512DQ-NEXT:    pushq %r12
-; AVX512DQ-NEXT:    pushq %rbx
 ; AVX512DQ-NEXT:    kmovw (%rdi), %k0
-; AVX512DQ-NEXT:    kshiftrw $10, %k0, %k1
-; AVX512DQ-NEXT:    kmovw %k1, %edi
-; AVX512DQ-NEXT:    kshiftrw $5, %k0, %k1
-; AVX512DQ-NEXT:    kmovw %k1, %ecx
-; AVX512DQ-NEXT:    kshiftrw $4, %k0, %k1
-; AVX512DQ-NEXT:    kmovw %k1, %eax
-; AVX512DQ-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512DQ-NEXT:    kshiftrw $3, %k0, %k1
-; AVX512DQ-NEXT:    kmovw %k1, %edx
-; AVX512DQ-NEXT:    kshiftrw $2, %k0, %k1
-; AVX512DQ-NEXT:    kmovw %k1, %r8d
-; AVX512DQ-NEXT:    kshiftrw $1, %k0, %k1
-; AVX512DQ-NEXT:    kmovw %k1, %r9d
-; AVX512DQ-NEXT:    kshiftrw $11, %k0, %k1
-; AVX512DQ-NEXT:    kmovw %k1, %r12d
-; AVX512DQ-NEXT:    kshiftrw $12, %k0, %k1
-; AVX512DQ-NEXT:    kmovw %k1, %r14d
-; AVX512DQ-NEXT:    kshiftrw $13, %k0, %k1
-; AVX512DQ-NEXT:    kmovw %k1, %ebp
-; AVX512DQ-NEXT:    kshiftrw $14, %k0, %k1
-; AVX512DQ-NEXT:    kmovw %k1, %r15d
-; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k1
-; AVX512DQ-NEXT:    kmovw %k1, %r11d
-; AVX512DQ-NEXT:    kshiftrw $6, %k0, %k1
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movl %edi, %r10d
-; AVX512DQ-NEXT:    andl $1, %r10d
-; AVX512DQ-NEXT:    leal (%r10,%r10,2), %ebx
-; AVX512DQ-NEXT:    leal (%rbx,%r12,4), %ebx
-; AVX512DQ-NEXT:    leal (%rbx,%r12,8), %ebx
-; AVX512DQ-NEXT:    movl %r12d, %r13d
-; AVX512DQ-NEXT:    shll $4, %r13d
-; AVX512DQ-NEXT:    orl %ebx, %r13d
-; AVX512DQ-NEXT:    movl %r12d, %ebx
-; AVX512DQ-NEXT:    shll $5, %ebx
-; AVX512DQ-NEXT:    orl %r13d, %ebx
-; AVX512DQ-NEXT:    movl %r12d, %r13d
-; AVX512DQ-NEXT:    shll $6, %r13d
-; AVX512DQ-NEXT:    shll $7, %r12d
-; AVX512DQ-NEXT:    orl %r13d, %r12d
-; AVX512DQ-NEXT:    andl $1, %r14d
-; AVX512DQ-NEXT:    movl %r14d, %r13d
-; AVX512DQ-NEXT:    shll $8, %r13d
-; AVX512DQ-NEXT:    orl %r12d, %r13d
-; AVX512DQ-NEXT:    movl %r14d, %r12d
-; AVX512DQ-NEXT:    shll $9, %r12d
-; AVX512DQ-NEXT:    orl %r13d, %r12d
-; AVX512DQ-NEXT:    movl %r14d, %r13d
-; AVX512DQ-NEXT:    shll $10, %r13d
-; AVX512DQ-NEXT:    orl %r12d, %r13d
-; AVX512DQ-NEXT:    movl %r14d, %r12d
-; AVX512DQ-NEXT:    shll $11, %r12d
-; AVX512DQ-NEXT:    orl %r13d, %r12d
-; AVX512DQ-NEXT:    movl %r14d, %r13d
-; AVX512DQ-NEXT:    shll $12, %r13d
-; AVX512DQ-NEXT:    orl %r12d, %r13d
-; AVX512DQ-NEXT:    kmovw %k1, %r12d
-; AVX512DQ-NEXT:    kshiftrw $7, %k0, %k1
-; AVX512DQ-NEXT:    shll $13, %r14d
-; AVX512DQ-NEXT:    orl %r13d, %r14d
-; AVX512DQ-NEXT:    andl $1, %ebp
-; AVX512DQ-NEXT:    movl %ebp, %r13d
-; AVX512DQ-NEXT:    shll $14, %r13d
-; AVX512DQ-NEXT:    orl %r14d, %r13d
-; AVX512DQ-NEXT:    movl %ebp, %r14d
-; AVX512DQ-NEXT:    shll $15, %r14d
-; AVX512DQ-NEXT:    orl %r13d, %r14d
-; AVX512DQ-NEXT:    movl %ebp, %r13d
-; AVX512DQ-NEXT:    shll $16, %r13d
-; AVX512DQ-NEXT:    orl %r14d, %r13d
-; AVX512DQ-NEXT:    movl %ebp, %r14d
-; AVX512DQ-NEXT:    shll $17, %r14d
-; AVX512DQ-NEXT:    orl %r13d, %r14d
-; AVX512DQ-NEXT:    movl %ebp, %r13d
-; AVX512DQ-NEXT:    shll $18, %r13d
-; AVX512DQ-NEXT:    orl %r14d, %r13d
-; AVX512DQ-NEXT:    kmovw %k1, %r14d
-; AVX512DQ-NEXT:    kshiftrw $8, %k0, %k1
-; AVX512DQ-NEXT:    shll $19, %ebp
-; AVX512DQ-NEXT:    orl %r13d, %ebp
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    movl %r15d, %r13d
-; AVX512DQ-NEXT:    shll $20, %r13d
-; AVX512DQ-NEXT:    orl %ebp, %r13d
-; AVX512DQ-NEXT:    movl %r15d, %ebp
-; AVX512DQ-NEXT:    shll $21, %ebp
-; AVX512DQ-NEXT:    orl %r13d, %ebp
-; AVX512DQ-NEXT:    movl %r15d, %r13d
-; AVX512DQ-NEXT:    shll $22, %r13d
-; AVX512DQ-NEXT:    orl %ebp, %r13d
-; AVX512DQ-NEXT:    movl %r15d, %ebp
-; AVX512DQ-NEXT:    shll $23, %ebp
-; AVX512DQ-NEXT:    orl %r13d, %ebp
-; AVX512DQ-NEXT:    movl %r15d, %r13d
-; AVX512DQ-NEXT:    shll $24, %r13d
-; AVX512DQ-NEXT:    orl %ebp, %r13d
-; AVX512DQ-NEXT:    shll $25, %r15d
-; AVX512DQ-NEXT:    orl %r13d, %r15d
-; AVX512DQ-NEXT:    movl %r11d, %r13d
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movl %r13d, %ebp
-; AVX512DQ-NEXT:    shll $26, %ebp
-; AVX512DQ-NEXT:    orl %r15d, %ebp
-; AVX512DQ-NEXT:    movl %r13d, %r15d
-; AVX512DQ-NEXT:    shll $27, %r15d
-; AVX512DQ-NEXT:    orl %ebp, %r15d
-; AVX512DQ-NEXT:    movl %r13d, %ebp
-; AVX512DQ-NEXT:    shll $28, %ebp
-; AVX512DQ-NEXT:    orl %r15d, %ebp
-; AVX512DQ-NEXT:    movl %r13d, %eax
-; AVX512DQ-NEXT:    shll $29, %eax
-; AVX512DQ-NEXT:    orl %ebp, %eax
-; AVX512DQ-NEXT:    kmovw %k1, %r15d
-; AVX512DQ-NEXT:    shll $30, %r13d
-; AVX512DQ-NEXT:    orl %eax, %r13d
-; AVX512DQ-NEXT:    kmovw %k0, %ebp
-; AVX512DQ-NEXT:    kshiftrw $9, %k0, %k0
-; AVX512DQ-NEXT:    shll $31, %r11d
-; AVX512DQ-NEXT:    orl %r13d, %r11d
-; AVX512DQ-NEXT:    kmovw %k0, %r13d
-; AVX512DQ-NEXT:    movzbl %dil, %edi
-; AVX512DQ-NEXT:    orl %ebx, %r11d
-; AVX512DQ-NEXT:    movl %r11d, 8(%rsi)
-; AVX512DQ-NEXT:    movzbl %bpl, %r11d
-; AVX512DQ-NEXT:    andl $1, %r11d
-; AVX512DQ-NEXT:    leaq (%r11,%r11,2), %rax
-; AVX512DQ-NEXT:    leaq (%rax,%r11,4), %rax
-; AVX512DQ-NEXT:    leaq (%rax,%r11,8), %rax
-; AVX512DQ-NEXT:    movq %r11, %rbx
-; AVX512DQ-NEXT:    shlq $4, %rbx
-; AVX512DQ-NEXT:    orq %rax, %rbx
-; AVX512DQ-NEXT:    shlq $5, %r11
-; AVX512DQ-NEXT:    orq %rbx, %r11
-; AVX512DQ-NEXT:    movzbl %r9b, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %r9
-; AVX512DQ-NEXT:    shlq $6, %r9
-; AVX512DQ-NEXT:    movq %rax, %rbx
-; AVX512DQ-NEXT:    shlq $7, %rbx
-; AVX512DQ-NEXT:    orq %r9, %rbx
-; AVX512DQ-NEXT:    movq %rax, %r9
-; AVX512DQ-NEXT:    shlq $8, %r9
-; AVX512DQ-NEXT:    orq %rbx, %r9
-; AVX512DQ-NEXT:    movq %rax, %rbx
-; AVX512DQ-NEXT:    shlq $9, %rbx
-; AVX512DQ-NEXT:    orq %r9, %rbx
-; AVX512DQ-NEXT:    movq %rax, %r9
-; AVX512DQ-NEXT:    shlq $10, %r9
-; AVX512DQ-NEXT:    orq %rbx, %r9
-; AVX512DQ-NEXT:    shlq $11, %rax
-; AVX512DQ-NEXT:    orq %r9, %rax
-; AVX512DQ-NEXT:    movzbl %r8b, %r8d
-; AVX512DQ-NEXT:    andl $1, %r8d
-; AVX512DQ-NEXT:    movq %r8, %r9
-; AVX512DQ-NEXT:    shlq $12, %r9
-; AVX512DQ-NEXT:    orq %rax, %r9
-; AVX512DQ-NEXT:    movq %r8, %rax
-; AVX512DQ-NEXT:    shlq $13, %rax
-; AVX512DQ-NEXT:    orq %r9, %rax
-; AVX512DQ-NEXT:    movq %r8, %r9
-; AVX512DQ-NEXT:    shlq $14, %r9
-; AVX512DQ-NEXT:    orq %rax, %r9
-; AVX512DQ-NEXT:    movq %r8, %rax
-; AVX512DQ-NEXT:    shlq $15, %rax
-; AVX512DQ-NEXT:    orq %r9, %rax
-; AVX512DQ-NEXT:    movq %r8, %r9
-; AVX512DQ-NEXT:    shlq $16, %r9
-; AVX512DQ-NEXT:    orq %rax, %r9
-; AVX512DQ-NEXT:    shlq $17, %r8
-; AVX512DQ-NEXT:    orq %r9, %r8
-; AVX512DQ-NEXT:    movzbl %dl, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %rdx
-; AVX512DQ-NEXT:    shlq $18, %rdx
-; AVX512DQ-NEXT:    orq %r8, %rdx
-; AVX512DQ-NEXT:    movq %rax, %r8
-; AVX512DQ-NEXT:    shlq $19, %r8
-; AVX512DQ-NEXT:    orq %rdx, %r8
-; AVX512DQ-NEXT:    movq %rax, %rdx
-; AVX512DQ-NEXT:    shlq $20, %rdx
-; AVX512DQ-NEXT:    orq %r8, %rdx
-; AVX512DQ-NEXT:    movq %rax, %r8
-; AVX512DQ-NEXT:    shlq $21, %r8
-; AVX512DQ-NEXT:    orq %rdx, %r8
-; AVX512DQ-NEXT:    movq %rax, %rdx
-; AVX512DQ-NEXT:    shlq $22, %rdx
-; AVX512DQ-NEXT:    orq %r8, %rdx
-; AVX512DQ-NEXT:    shlq $23, %rax
-; AVX512DQ-NEXT:    orq %rdx, %rax
-; AVX512DQ-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
-; AVX512DQ-NEXT:    andl $1, %edx
-; AVX512DQ-NEXT:    movq %rdx, %r8
-; AVX512DQ-NEXT:    shlq $24, %r8
-; AVX512DQ-NEXT:    orq %rax, %r8
-; AVX512DQ-NEXT:    movq %rdx, %rax
-; AVX512DQ-NEXT:    shlq $25, %rax
-; AVX512DQ-NEXT:    orq %r8, %rax
-; AVX512DQ-NEXT:    movq %rdx, %r8
-; AVX512DQ-NEXT:    shlq $26, %r8
-; AVX512DQ-NEXT:    orq %rax, %r8
-; AVX512DQ-NEXT:    movq %rdx, %rax
-; AVX512DQ-NEXT:    shlq $27, %rax
-; AVX512DQ-NEXT:    orq %r8, %rax
-; AVX512DQ-NEXT:    movq %rdx, %r8
-; AVX512DQ-NEXT:    shlq $28, %r8
-; AVX512DQ-NEXT:    orq %rax, %r8
-; AVX512DQ-NEXT:    shlq $29, %rdx
-; AVX512DQ-NEXT:    orq %r8, %rdx
-; AVX512DQ-NEXT:    movzbl %cl, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %r8
-; AVX512DQ-NEXT:    shlq $30, %r8
-; AVX512DQ-NEXT:    orq %rdx, %r8
-; AVX512DQ-NEXT:    shlq $31, %rax
-; AVX512DQ-NEXT:    orq %r8, %rax
-; AVX512DQ-NEXT:    andl $1, %ecx
-; AVX512DQ-NEXT:    movq %rcx, %rdx
-; AVX512DQ-NEXT:    shlq $32, %rdx
-; AVX512DQ-NEXT:    orq %rax, %rdx
-; AVX512DQ-NEXT:    movq %rcx, %rax
-; AVX512DQ-NEXT:    shlq $33, %rax
-; AVX512DQ-NEXT:    orq %rdx, %rax
-; AVX512DQ-NEXT:    movq %rcx, %rdx
-; AVX512DQ-NEXT:    shlq $34, %rdx
-; AVX512DQ-NEXT:    orq %rax, %rdx
-; AVX512DQ-NEXT:    shlq $35, %rcx
-; AVX512DQ-NEXT:    orq %rdx, %rcx
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $36, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    movq %r12, %rcx
-; AVX512DQ-NEXT:    shlq $37, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $38, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    movq %r12, %rcx
-; AVX512DQ-NEXT:    shlq $39, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $40, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    shlq $41, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    andl $1, %r14d
-; AVX512DQ-NEXT:    movq %r14, %rax
-; AVX512DQ-NEXT:    shlq $42, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r14, %rcx
-; AVX512DQ-NEXT:    shlq $43, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    movq %r14, %rax
-; AVX512DQ-NEXT:    shlq $44, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    movq %r14, %rcx
-; AVX512DQ-NEXT:    shlq $45, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    movq %r14, %rax
-; AVX512DQ-NEXT:    shlq $46, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    shlq $47, %r14
-; AVX512DQ-NEXT:    orq %rax, %r14
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $48, %rax
-; AVX512DQ-NEXT:    orq %r14, %rax
-; AVX512DQ-NEXT:    movq %r15, %rcx
-; AVX512DQ-NEXT:    shlq $49, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $50, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    movq %r15, %rcx
-; AVX512DQ-NEXT:    shlq $51, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $52, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    shlq $53, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $54, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movq %r13, %rcx
-; AVX512DQ-NEXT:    shlq $55, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $56, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    movq %r13, %rcx
-; AVX512DQ-NEXT:    shlq $57, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $58, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    shlq $59, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r10, %rax
-; AVX512DQ-NEXT:    shlq $60, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r10, %rcx
-; AVX512DQ-NEXT:    shlq $61, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    shlq $62, %r10
-; AVX512DQ-NEXT:    orq %rcx, %r10
-; AVX512DQ-NEXT:    shlq $63, %rdi
-; AVX512DQ-NEXT:    orq %r10, %rdi
-; AVX512DQ-NEXT:    orq %r11, %rdi
-; AVX512DQ-NEXT:    movq %rdi, (%rsi)
-; AVX512DQ-NEXT:    popq %rbx
-; AVX512DQ-NEXT:    popq %r12
-; AVX512DQ-NEXT:    popq %r13
-; AVX512DQ-NEXT:    popq %r14
-; AVX512DQ-NEXT:    popq %r15
-; AVX512DQ-NEXT:    popq %rbp
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm1
+; AVX512DQ-NEXT:    movw $1, %ax
+; AVX512DQ-NEXT:    kmovw %eax, %k1
+; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k2
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k1
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k3
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k4
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k5
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k6
+; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k2} {z}
+; AVX512DQ-NEXT:    vmovdqa32 320(%rsi), %zmm1 {%k6} {z}
+; AVX512DQ-NEXT:    vmovdqa32 256(%rsi), %zmm2 {%k5} {z}
+; AVX512DQ-NEXT:    vmovdqa32 192(%rsi), %zmm3 {%k4} {z}
+; AVX512DQ-NEXT:    vmovdqa32 128(%rsi), %zmm4 {%k3} {z}
+; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm5 {%k1} {z}
+; AVX512DQ-NEXT:    vmovdqa64 %zmm5, 64(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 128(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 192(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 256(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 320(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm0, (%rdx)
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512BW-LABEL: mask_replication_factor6_vf16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    pushq %rbp
-; AVX512BW-NEXT:    pushq %r15
-; AVX512BW-NEXT:    pushq %r14
-; AVX512BW-NEXT:    pushq %r13
-; AVX512BW-NEXT:    pushq %r12
-; AVX512BW-NEXT:    pushq %rbx
-; AVX512BW-NEXT:    kmovw (%rdi), %k0
-; AVX512BW-NEXT:    kshiftrw $10, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %edi
-; AVX512BW-NEXT:    kshiftrw $5, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %ecx
-; AVX512BW-NEXT:    kshiftrw $4, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512BW-NEXT:    kshiftrw $3, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %edx
-; AVX512BW-NEXT:    kshiftrw $2, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r8d
-; AVX512BW-NEXT:    kshiftrw $1, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r9d
-; AVX512BW-NEXT:    kshiftrw $11, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrw $12, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r14d
-; AVX512BW-NEXT:    kshiftrw $13, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %ebp
-; AVX512BW-NEXT:    kshiftrw $14, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r15d
-; AVX512BW-NEXT:    kshiftrw $15, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r11d
-; AVX512BW-NEXT:    kshiftrw $6, %k0, %k1
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movl %edi, %r10d
-; AVX512BW-NEXT:    andl $1, %r10d
-; AVX512BW-NEXT:    leal (%r10,%r10,2), %ebx
-; AVX512BW-NEXT:    leal (%rbx,%r12,4), %ebx
-; AVX512BW-NEXT:    leal (%rbx,%r12,8), %ebx
-; AVX512BW-NEXT:    movl %r12d, %r13d
-; AVX512BW-NEXT:    shll $4, %r13d
-; AVX512BW-NEXT:    orl %ebx, %r13d
-; AVX512BW-NEXT:    movl %r12d, %ebx
-; AVX512BW-NEXT:    shll $5, %ebx
-; AVX512BW-NEXT:    orl %r13d, %ebx
-; AVX512BW-NEXT:    movl %r12d, %r13d
-; AVX512BW-NEXT:    shll $6, %r13d
-; AVX512BW-NEXT:    shll $7, %r12d
-; AVX512BW-NEXT:    orl %r13d, %r12d
-; AVX512BW-NEXT:    andl $1, %r14d
-; AVX512BW-NEXT:    movl %r14d, %r13d
-; AVX512BW-NEXT:    shll $8, %r13d
-; AVX512BW-NEXT:    orl %r12d, %r13d
-; AVX512BW-NEXT:    movl %r14d, %r12d
-; AVX512BW-NEXT:    shll $9, %r12d
-; AVX512BW-NEXT:    orl %r13d, %r12d
-; AVX512BW-NEXT:    movl %r14d, %r13d
-; AVX512BW-NEXT:    shll $10, %r13d
-; AVX512BW-NEXT:    orl %r12d, %r13d
-; AVX512BW-NEXT:    movl %r14d, %r12d
-; AVX512BW-NEXT:    shll $11, %r12d
-; AVX512BW-NEXT:    orl %r13d, %r12d
-; AVX512BW-NEXT:    movl %r14d, %r13d
-; AVX512BW-NEXT:    shll $12, %r13d
-; AVX512BW-NEXT:    orl %r12d, %r13d
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrw $7, %k0, %k1
-; AVX512BW-NEXT:    shll $13, %r14d
-; AVX512BW-NEXT:    orl %r13d, %r14d
-; AVX512BW-NEXT:    andl $1, %ebp
-; AVX512BW-NEXT:    movl %ebp, %r13d
-; AVX512BW-NEXT:    shll $14, %r13d
-; AVX512BW-NEXT:    orl %r14d, %r13d
-; AVX512BW-NEXT:    movl %ebp, %r14d
-; AVX512BW-NEXT:    shll $15, %r14d
-; AVX512BW-NEXT:    orl %r13d, %r14d
-; AVX512BW-NEXT:    movl %ebp, %r13d
-; AVX512BW-NEXT:    shll $16, %r13d
-; AVX512BW-NEXT:    orl %r14d, %r13d
-; AVX512BW-NEXT:    movl %ebp, %r14d
-; AVX512BW-NEXT:    shll $17, %r14d
-; AVX512BW-NEXT:    orl %r13d, %r14d
-; AVX512BW-NEXT:    movl %ebp, %r13d
-; AVX512BW-NEXT:    shll $18, %r13d
-; AVX512BW-NEXT:    orl %r14d, %r13d
-; AVX512BW-NEXT:    kmovd %k1, %r14d
-; AVX512BW-NEXT:    kshiftrw $8, %k0, %k1
-; AVX512BW-NEXT:    shll $19, %ebp
-; AVX512BW-NEXT:    orl %r13d, %ebp
-; AVX512BW-NEXT:    andl $1, %r15d
-; AVX512BW-NEXT:    movl %r15d, %r13d
-; AVX512BW-NEXT:    shll $20, %r13d
-; AVX512BW-NEXT:    orl %ebp, %r13d
-; AVX512BW-NEXT:    movl %r15d, %ebp
-; AVX512BW-NEXT:    shll $21, %ebp
-; AVX512BW-NEXT:    orl %r13d, %ebp
-; AVX512BW-NEXT:    movl %r15d, %r13d
-; AVX512BW-NEXT:    shll $22, %r13d
-; AVX512BW-NEXT:    orl %ebp, %r13d
-; AVX512BW-NEXT:    movl %r15d, %ebp
-; AVX512BW-NEXT:    shll $23, %ebp
-; AVX512BW-NEXT:    orl %r13d, %ebp
-; AVX512BW-NEXT:    movl %r15d, %r13d
-; AVX512BW-NEXT:    shll $24, %r13d
-; AVX512BW-NEXT:    orl %ebp, %r13d
-; AVX512BW-NEXT:    shll $25, %r15d
-; AVX512BW-NEXT:    orl %r13d, %r15d
-; AVX512BW-NEXT:    movl %r11d, %r13d
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movl %r13d, %ebp
-; AVX512BW-NEXT:    shll $26, %ebp
-; AVX512BW-NEXT:    orl %r15d, %ebp
-; AVX512BW-NEXT:    movl %r13d, %r15d
-; AVX512BW-NEXT:    shll $27, %r15d
-; AVX512BW-NEXT:    orl %ebp, %r15d
-; AVX512BW-NEXT:    movl %r13d, %ebp
-; AVX512BW-NEXT:    shll $28, %ebp
-; AVX512BW-NEXT:    orl %r15d, %ebp
-; AVX512BW-NEXT:    movl %r13d, %eax
-; AVX512BW-NEXT:    shll $29, %eax
-; AVX512BW-NEXT:    orl %ebp, %eax
-; AVX512BW-NEXT:    kmovd %k1, %r15d
-; AVX512BW-NEXT:    shll $30, %r13d
-; AVX512BW-NEXT:    orl %eax, %r13d
-; AVX512BW-NEXT:    kmovd %k0, %ebp
-; AVX512BW-NEXT:    kshiftrw $9, %k0, %k0
-; AVX512BW-NEXT:    shll $31, %r11d
-; AVX512BW-NEXT:    orl %r13d, %r11d
-; AVX512BW-NEXT:    kmovd %k0, %r13d
-; AVX512BW-NEXT:    movzbl %dil, %edi
-; AVX512BW-NEXT:    orl %ebx, %r11d
-; AVX512BW-NEXT:    movl %r11d, 8(%rsi)
-; AVX512BW-NEXT:    movzbl %bpl, %r11d
-; AVX512BW-NEXT:    andl $1, %r11d
-; AVX512BW-NEXT:    leaq (%r11,%r11,2), %rax
-; AVX512BW-NEXT:    leaq (%rax,%r11,4), %rax
-; AVX512BW-NEXT:    leaq (%rax,%r11,8), %rax
-; AVX512BW-NEXT:    movq %r11, %rbx
-; AVX512BW-NEXT:    shlq $4, %rbx
-; AVX512BW-NEXT:    orq %rax, %rbx
-; AVX512BW-NEXT:    shlq $5, %r11
-; AVX512BW-NEXT:    orq %rbx, %r11
-; AVX512BW-NEXT:    movzbl %r9b, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %r9
-; AVX512BW-NEXT:    shlq $6, %r9
-; AVX512BW-NEXT:    movq %rax, %rbx
-; AVX512BW-NEXT:    shlq $7, %rbx
-; AVX512BW-NEXT:    orq %r9, %rbx
-; AVX512BW-NEXT:    movq %rax, %r9
-; AVX512BW-NEXT:    shlq $8, %r9
-; AVX512BW-NEXT:    orq %rbx, %r9
-; AVX512BW-NEXT:    movq %rax, %rbx
-; AVX512BW-NEXT:    shlq $9, %rbx
-; AVX512BW-NEXT:    orq %r9, %rbx
-; AVX512BW-NEXT:    movq %rax, %r9
-; AVX512BW-NEXT:    shlq $10, %r9
-; AVX512BW-NEXT:    orq %rbx, %r9
-; AVX512BW-NEXT:    shlq $11, %rax
-; AVX512BW-NEXT:    orq %r9, %rax
-; AVX512BW-NEXT:    movzbl %r8b, %r8d
-; AVX512BW-NEXT:    andl $1, %r8d
-; AVX512BW-NEXT:    movq %r8, %r9
-; AVX512BW-NEXT:    shlq $12, %r9
-; AVX512BW-NEXT:    orq %rax, %r9
-; AVX512BW-NEXT:    movq %r8, %rax
-; AVX512BW-NEXT:    shlq $13, %rax
-; AVX512BW-NEXT:    orq %r9, %rax
-; AVX512BW-NEXT:    movq %r8, %r9
-; AVX512BW-NEXT:    shlq $14, %r9
-; AVX512BW-NEXT:    orq %rax, %r9
-; AVX512BW-NEXT:    movq %r8, %rax
-; AVX512BW-NEXT:    shlq $15, %rax
-; AVX512BW-NEXT:    orq %r9, %rax
-; AVX512BW-NEXT:    movq %r8, %r9
-; AVX512BW-NEXT:    shlq $16, %r9
-; AVX512BW-NEXT:    orq %rax, %r9
-; AVX512BW-NEXT:    shlq $17, %r8
-; AVX512BW-NEXT:    orq %r9, %r8
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %rdx
-; AVX512BW-NEXT:    shlq $18, %rdx
-; AVX512BW-NEXT:    orq %r8, %rdx
-; AVX512BW-NEXT:    movq %rax, %r8
-; AVX512BW-NEXT:    shlq $19, %r8
-; AVX512BW-NEXT:    orq %rdx, %r8
-; AVX512BW-NEXT:    movq %rax, %rdx
-; AVX512BW-NEXT:    shlq $20, %rdx
-; AVX512BW-NEXT:    orq %r8, %rdx
-; AVX512BW-NEXT:    movq %rax, %r8
-; AVX512BW-NEXT:    shlq $21, %r8
-; AVX512BW-NEXT:    orq %rdx, %r8
-; AVX512BW-NEXT:    movq %rax, %rdx
-; AVX512BW-NEXT:    shlq $22, %rdx
-; AVX512BW-NEXT:    orq %r8, %rdx
-; AVX512BW-NEXT:    shlq $23, %rax
-; AVX512BW-NEXT:    orq %rdx, %rax
-; AVX512BW-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
-; AVX512BW-NEXT:    andl $1, %edx
-; AVX512BW-NEXT:    movq %rdx, %r8
-; AVX512BW-NEXT:    shlq $24, %r8
-; AVX512BW-NEXT:    orq %rax, %r8
-; AVX512BW-NEXT:    movq %rdx, %rax
-; AVX512BW-NEXT:    shlq $25, %rax
-; AVX512BW-NEXT:    orq %r8, %rax
-; AVX512BW-NEXT:    movq %rdx, %r8
-; AVX512BW-NEXT:    shlq $26, %r8
-; AVX512BW-NEXT:    orq %rax, %r8
-; AVX512BW-NEXT:    movq %rdx, %rax
-; AVX512BW-NEXT:    shlq $27, %rax
-; AVX512BW-NEXT:    orq %r8, %rax
-; AVX512BW-NEXT:    movq %rdx, %r8
-; AVX512BW-NEXT:    shlq $28, %r8
-; AVX512BW-NEXT:    orq %rax, %r8
-; AVX512BW-NEXT:    shlq $29, %rdx
-; AVX512BW-NEXT:    orq %r8, %rdx
-; AVX512BW-NEXT:    movzbl %cl, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %r8
-; AVX512BW-NEXT:    shlq $30, %r8
-; AVX512BW-NEXT:    orq %rdx, %r8
-; AVX512BW-NEXT:    shlq $31, %rax
-; AVX512BW-NEXT:    orq %r8, %rax
-; AVX512BW-NEXT:    andl $1, %ecx
-; AVX512BW-NEXT:    movq %rcx, %rdx
-; AVX512BW-NEXT:    shlq $32, %rdx
-; AVX512BW-NEXT:    orq %rax, %rdx
-; AVX512BW-NEXT:    movq %rcx, %rax
-; AVX512BW-NEXT:    shlq $33, %rax
-; AVX512BW-NEXT:    orq %rdx, %rax
-; AVX512BW-NEXT:    movq %rcx, %rdx
-; AVX512BW-NEXT:    shlq $34, %rdx
-; AVX512BW-NEXT:    orq %rax, %rdx
-; AVX512BW-NEXT:    shlq $35, %rcx
-; AVX512BW-NEXT:    orq %rdx, %rcx
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $36, %rax
-; AVX512BW-NEXT:    orq %rcx, %rax
-; AVX512BW-NEXT:    movq %r12, %rcx
-; AVX512BW-NEXT:    shlq $37, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $38, %rax
-; AVX512BW-NEXT:    orq %rcx, %rax
-; AVX512BW-NEXT:    movq %r12, %rcx
-; AVX512BW-NEXT:    shlq $39, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $40, %rax
-; AVX512BW-NEXT:    orq %rcx, %rax
-; AVX512BW-NEXT:    shlq $41, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    andl $1, %r14d
-; AVX512BW-NEXT:    movq %r14, %rax
-; AVX512BW-NEXT:    shlq $42, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r14, %rcx
-; AVX512BW-NEXT:    shlq $43, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    movq %r14, %rax
-; AVX512BW-NEXT:    shlq $44, %rax
-; AVX512BW-NEXT:    orq %rcx, %rax
-; AVX512BW-NEXT:    movq %r14, %rcx
-; AVX512BW-NEXT:    shlq $45, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    movq %r14, %rax
-; AVX512BW-NEXT:    shlq $46, %rax
-; AVX512BW-NEXT:    orq %rcx, %rax
-; AVX512BW-NEXT:    shlq $47, %r14
-; AVX512BW-NEXT:    orq %rax, %r14
-; AVX512BW-NEXT:    andl $1, %r15d
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $48, %rax
-; AVX512BW-NEXT:    orq %r14, %rax
-; AVX512BW-NEXT:    movq %r15, %rcx
-; AVX512BW-NEXT:    shlq $49, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $50, %rax
-; AVX512BW-NEXT:    orq %rcx, %rax
-; AVX512BW-NEXT:    movq %r15, %rcx
-; AVX512BW-NEXT:    shlq $51, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $52, %rax
-; AVX512BW-NEXT:    orq %rcx, %rax
-; AVX512BW-NEXT:    shlq $53, %r15
-; AVX512BW-NEXT:    orq %rax, %r15
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $54, %rax
-; AVX512BW-NEXT:    orq %r15, %rax
-; AVX512BW-NEXT:    movq %r13, %rcx
-; AVX512BW-NEXT:    shlq $55, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $56, %rax
-; AVX512BW-NEXT:    orq %rcx, %rax
-; AVX512BW-NEXT:    movq %r13, %rcx
-; AVX512BW-NEXT:    shlq $57, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $58, %rax
-; AVX512BW-NEXT:    orq %rcx, %rax
-; AVX512BW-NEXT:    shlq $59, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r10, %rax
-; AVX512BW-NEXT:    shlq $60, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r10, %rcx
-; AVX512BW-NEXT:    shlq $61, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    shlq $62, %r10
-; AVX512BW-NEXT:    orq %rcx, %r10
-; AVX512BW-NEXT:    shlq $63, %rdi
-; AVX512BW-NEXT:    orq %r10, %rdi
-; AVX512BW-NEXT:    orq %r11, %rdi
-; AVX512BW-NEXT:    movq %rdi, (%rsi)
-; AVX512BW-NEXT:    popq %rbx
-; AVX512BW-NEXT:    popq %r12
-; AVX512BW-NEXT:    popq %r13
-; AVX512BW-NEXT:    popq %r14
-; AVX512BW-NEXT:    popq %r15
-; AVX512BW-NEXT:    popq %rbp
+; AVX512BW-NEXT:    kmovw (%rdi), %k1
+; AVX512BW-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
+; AVX512BW-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512BW-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k1} {z}
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpermd %zmm0, %zmm2, %zmm2
+; AVX512BW-NEXT:    vptestmd %zmm2, %zmm2, %k1
+; AVX512BW-NEXT:    vmovdqa32 320(%rsi), %zmm2 {%k1} {z}
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13]
+; AVX512BW-NEXT:    vpermd %zmm0, %zmm3, %zmm3
+; AVX512BW-NEXT:    vptestmd %zmm3, %zmm3, %k1
+; AVX512BW-NEXT:    vmovdqa32 256(%rsi), %zmm3 {%k1} {z}
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10]
+; AVX512BW-NEXT:    vpermd %zmm0, %zmm4, %zmm4
+; AVX512BW-NEXT:    vptestmd %zmm4, %zmm4, %k1
+; AVX512BW-NEXT:    vmovdqa32 192(%rsi), %zmm4 {%k1} {z}
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7]
+; AVX512BW-NEXT:    vpermd %zmm0, %zmm5, %zmm5
+; AVX512BW-NEXT:    vptestmd %zmm5, %zmm5, %k1
+; AVX512BW-NEXT:    vmovdqa32 128(%rsi), %zmm5 {%k1} {z}
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5]
+; AVX512BW-NEXT:    vpermd %zmm0, %zmm6, %zmm0
+; AVX512BW-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; AVX512BW-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, 128(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, 192(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, 256(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, 320(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, (%rdx)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
-  %src.vec = load <16 x i1>, ptr %in.vec, align 64
-  %tgt.mask = shufflevector <16 x i1> %src.vec, <16 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-  store <96 x i1> %tgt.mask, ptr %out.vec, align 64
+  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
+  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %tgt.mask = shufflevector <16 x i1> %src.mask, <16 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+  %data = call <96 x i32> @llvm.masked.load.v96i32.p0(ptr %in.vec, i32 64, <96 x i1> %tgt.mask, <96 x i32> poison)
+  store <96 x i32> %data, ptr %out.vec, align 64
   ret void
 }
 
-define void @mask_replication_factor6_vf32(ptr %in.vec, ptr %out.vec) nounwind {
+define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512F-ONLY-LABEL: mask_replication_factor6_vf32:
 ; AVX512F-ONLY:       # %bb.0:
-; AVX512F-ONLY-NEXT:    pushq %rbp
-; AVX512F-ONLY-NEXT:    pushq %r15
-; AVX512F-ONLY-NEXT:    pushq %r14
-; AVX512F-ONLY-NEXT:    pushq %r13
-; AVX512F-ONLY-NEXT:    pushq %r12
-; AVX512F-ONLY-NEXT:    pushq %rbx
 ; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
-; AVX512F-ONLY-NEXT:    kmovw 2(%rdi), %k0
-; AVX512F-ONLY-NEXT:    kshiftrw $15, %k0, %k2
-; AVX512F-ONLY-NEXT:    kmovw %k2, %edx
-; AVX512F-ONLY-NEXT:    kshiftrw $10, %k0, %k2
-; AVX512F-ONLY-NEXT:    kmovw %k2, %ecx
-; AVX512F-ONLY-NEXT:    kshiftrw $9, %k0, %k2
-; AVX512F-ONLY-NEXT:    kmovw %k2, %eax
-; AVX512F-ONLY-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512F-ONLY-NEXT:    kshiftrw $8, %k0, %k2
-; AVX512F-ONLY-NEXT:    kmovw %k2, %edi
-; AVX512F-ONLY-NEXT:    kshiftrw $7, %k0, %k2
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r8d
-; AVX512F-ONLY-NEXT:    kshiftrw $6, %k0, %k2
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r9d
-; AVX512F-ONLY-NEXT:    kshiftrw $5, %k0, %k2
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r10d
-; AVX512F-ONLY-NEXT:    kshiftrw $3, %k0, %k2
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r11d
-; AVX512F-ONLY-NEXT:    kshiftrw $15, %k1, %k2
-; AVX512F-ONLY-NEXT:    kmovw %k2, %ebx
-; AVX512F-ONLY-NEXT:    kshiftrw $5, %k1, %k2
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r15d
-; AVX512F-ONLY-NEXT:    kshiftrw $1, %k1, %k2
-; AVX512F-ONLY-NEXT:    kmovw %k2, %ebp
-; AVX512F-ONLY-NEXT:    kshiftrw $2, %k1, %k2
-; AVX512F-ONLY-NEXT:    kmovw %k1, %r14d
-; AVX512F-ONLY-NEXT:    movzbl %r14b, %r14d
-; AVX512F-ONLY-NEXT:    andl $1, %r14d
-; AVX512F-ONLY-NEXT:    leaq (%r14,%r14,2), %r12
-; AVX512F-ONLY-NEXT:    leaq (%r12,%r14,4), %r12
-; AVX512F-ONLY-NEXT:    leaq (%r12,%r14,8), %r12
-; AVX512F-ONLY-NEXT:    movq %r14, %r13
-; AVX512F-ONLY-NEXT:    shlq $4, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $5, %r14
-; AVX512F-ONLY-NEXT:    orq %r13, %r14
-; AVX512F-ONLY-NEXT:    movzbl %bpl, %r13d
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $6, %r12
-; AVX512F-ONLY-NEXT:    movq %r13, %rbp
-; AVX512F-ONLY-NEXT:    shlq $7, %rbp
-; AVX512F-ONLY-NEXT:    orq %r12, %rbp
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $8, %r12
-; AVX512F-ONLY-NEXT:    orq %rbp, %r12
-; AVX512F-ONLY-NEXT:    movq %r13, %rbp
-; AVX512F-ONLY-NEXT:    shlq $9, %rbp
-; AVX512F-ONLY-NEXT:    orq %r12, %rbp
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $10, %r12
-; AVX512F-ONLY-NEXT:    orq %rbp, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k2, %ebp
-; AVX512F-ONLY-NEXT:    kshiftrw $3, %k1, %k2
-; AVX512F-ONLY-NEXT:    shlq $11, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    movzbl %bpl, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rbp
-; AVX512F-ONLY-NEXT:    shlq $12, %rbp
-; AVX512F-ONLY-NEXT:    orq %r13, %rbp
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $13, %r13
-; AVX512F-ONLY-NEXT:    orq %rbp, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %rbp
-; AVX512F-ONLY-NEXT:    shlq $14, %rbp
-; AVX512F-ONLY-NEXT:    orq %r13, %rbp
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $15, %r13
-; AVX512F-ONLY-NEXT:    orq %rbp, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %rbp
-; AVX512F-ONLY-NEXT:    shlq $16, %rbp
-; AVX512F-ONLY-NEXT:    orq %r13, %rbp
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r13d
-; AVX512F-ONLY-NEXT:    kshiftrw $4, %k1, %k2
-; AVX512F-ONLY-NEXT:    shlq $17, %r12
-; AVX512F-ONLY-NEXT:    orq %rbp, %r12
-; AVX512F-ONLY-NEXT:    movzbl %r13b, %r13d
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %rbp
-; AVX512F-ONLY-NEXT:    shlq $18, %rbp
-; AVX512F-ONLY-NEXT:    orq %r12, %rbp
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $19, %r12
-; AVX512F-ONLY-NEXT:    orq %rbp, %r12
-; AVX512F-ONLY-NEXT:    movq %r13, %rbp
-; AVX512F-ONLY-NEXT:    shlq $20, %rbp
-; AVX512F-ONLY-NEXT:    orq %r12, %rbp
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $21, %r12
-; AVX512F-ONLY-NEXT:    orq %rbp, %r12
-; AVX512F-ONLY-NEXT:    movq %r13, %rbp
-; AVX512F-ONLY-NEXT:    shlq $22, %rbp
-; AVX512F-ONLY-NEXT:    orq %r12, %rbp
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $10, %k1, %k2
-; AVX512F-ONLY-NEXT:    shlq $23, %r13
-; AVX512F-ONLY-NEXT:    orq %rbp, %r13
-; AVX512F-ONLY-NEXT:    movzbl %r12b, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rbp
-; AVX512F-ONLY-NEXT:    shlq $24, %rbp
-; AVX512F-ONLY-NEXT:    orq %r13, %rbp
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $25, %r13
-; AVX512F-ONLY-NEXT:    orq %rbp, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %rbp
-; AVX512F-ONLY-NEXT:    shlq $26, %rbp
-; AVX512F-ONLY-NEXT:    orq %r13, %rbp
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $27, %r13
-; AVX512F-ONLY-NEXT:    orq %rbp, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %rbp
-; AVX512F-ONLY-NEXT:    shlq $28, %rbp
-; AVX512F-ONLY-NEXT:    orq %r13, %rbp
-; AVX512F-ONLY-NEXT:    shlq $29, %r12
-; AVX512F-ONLY-NEXT:    orq %rbp, %r12
-; AVX512F-ONLY-NEXT:    movzbl %r15b, %r13d
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %rbp
-; AVX512F-ONLY-NEXT:    shlq $30, %rbp
-; AVX512F-ONLY-NEXT:    orq %r12, %rbp
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $6, %k1, %k2
-; AVX512F-ONLY-NEXT:    shlq $31, %r13
-; AVX512F-ONLY-NEXT:    orq %rbp, %r13
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    movq %r15, %rbp
-; AVX512F-ONLY-NEXT:    shlq $32, %rbp
-; AVX512F-ONLY-NEXT:    orq %r13, %rbp
-; AVX512F-ONLY-NEXT:    movq %r15, %r13
-; AVX512F-ONLY-NEXT:    shlq $33, %r13
-; AVX512F-ONLY-NEXT:    orq %rbp, %r13
-; AVX512F-ONLY-NEXT:    movq %r15, %rbp
-; AVX512F-ONLY-NEXT:    shlq $34, %rbp
-; AVX512F-ONLY-NEXT:    orq %r13, %rbp
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r13d
-; AVX512F-ONLY-NEXT:    kshiftrw $7, %k1, %k2
-; AVX512F-ONLY-NEXT:    shlq $35, %r15
-; AVX512F-ONLY-NEXT:    orq %rbp, %r15
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %rbp
-; AVX512F-ONLY-NEXT:    shlq $36, %rbp
-; AVX512F-ONLY-NEXT:    orq %r15, %rbp
-; AVX512F-ONLY-NEXT:    movq %r13, %r15
-; AVX512F-ONLY-NEXT:    shlq $37, %r15
-; AVX512F-ONLY-NEXT:    orq %rbp, %r15
-; AVX512F-ONLY-NEXT:    movq %r13, %rbp
-; AVX512F-ONLY-NEXT:    shlq $38, %rbp
-; AVX512F-ONLY-NEXT:    orq %r15, %rbp
-; AVX512F-ONLY-NEXT:    movq %r13, %r15
-; AVX512F-ONLY-NEXT:    shlq $39, %r15
-; AVX512F-ONLY-NEXT:    orq %rbp, %r15
-; AVX512F-ONLY-NEXT:    movq %r13, %rbp
-; AVX512F-ONLY-NEXT:    shlq $40, %rbp
-; AVX512F-ONLY-NEXT:    orq %r15, %rbp
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r15d
-; AVX512F-ONLY-NEXT:    kshiftrw $8, %k1, %k2
-; AVX512F-ONLY-NEXT:    shlq $41, %r13
-; AVX512F-ONLY-NEXT:    orq %rbp, %r13
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    movq %r15, %rbp
-; AVX512F-ONLY-NEXT:    shlq $42, %rbp
-; AVX512F-ONLY-NEXT:    orq %r13, %rbp
-; AVX512F-ONLY-NEXT:    movq %r15, %r13
-; AVX512F-ONLY-NEXT:    shlq $43, %r13
-; AVX512F-ONLY-NEXT:    orq %rbp, %r13
-; AVX512F-ONLY-NEXT:    movq %r15, %rbp
-; AVX512F-ONLY-NEXT:    shlq $44, %rbp
-; AVX512F-ONLY-NEXT:    orq %r13, %rbp
-; AVX512F-ONLY-NEXT:    movq %r15, %r13
-; AVX512F-ONLY-NEXT:    shlq $45, %r13
-; AVX512F-ONLY-NEXT:    orq %rbp, %r13
-; AVX512F-ONLY-NEXT:    movq %r15, %rbp
-; AVX512F-ONLY-NEXT:    shlq $46, %rbp
-; AVX512F-ONLY-NEXT:    orq %r13, %rbp
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r13d
-; AVX512F-ONLY-NEXT:    kshiftrw $9, %k1, %k2
-; AVX512F-ONLY-NEXT:    shlq $47, %r15
-; AVX512F-ONLY-NEXT:    orq %rbp, %r15
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %rbp
-; AVX512F-ONLY-NEXT:    shlq $48, %rbp
-; AVX512F-ONLY-NEXT:    orq %r15, %rbp
-; AVX512F-ONLY-NEXT:    movq %r13, %r15
-; AVX512F-ONLY-NEXT:    shlq $49, %r15
-; AVX512F-ONLY-NEXT:    orq %rbp, %r15
-; AVX512F-ONLY-NEXT:    movq %r13, %rbp
-; AVX512F-ONLY-NEXT:    shlq $50, %rbp
-; AVX512F-ONLY-NEXT:    orq %r15, %rbp
-; AVX512F-ONLY-NEXT:    movq %r13, %r15
-; AVX512F-ONLY-NEXT:    shlq $51, %r15
-; AVX512F-ONLY-NEXT:    orq %rbp, %r15
-; AVX512F-ONLY-NEXT:    movq %r13, %rbp
-; AVX512F-ONLY-NEXT:    shlq $52, %rbp
-; AVX512F-ONLY-NEXT:    orq %r15, %rbp
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r15d
-; AVX512F-ONLY-NEXT:    kshiftrw $14, %k1, %k2
-; AVX512F-ONLY-NEXT:    shlq $53, %r13
-; AVX512F-ONLY-NEXT:    orq %rbp, %r13
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    movq %r15, %rbp
-; AVX512F-ONLY-NEXT:    shlq $54, %rbp
-; AVX512F-ONLY-NEXT:    orq %r13, %rbp
-; AVX512F-ONLY-NEXT:    movq %r15, %r13
-; AVX512F-ONLY-NEXT:    shlq $55, %r13
-; AVX512F-ONLY-NEXT:    orq %rbp, %r13
-; AVX512F-ONLY-NEXT:    movq %r15, %rbp
-; AVX512F-ONLY-NEXT:    shlq $56, %rbp
-; AVX512F-ONLY-NEXT:    orq %r13, %rbp
-; AVX512F-ONLY-NEXT:    movq %r15, %r13
-; AVX512F-ONLY-NEXT:    shlq $57, %r13
-; AVX512F-ONLY-NEXT:    orq %rbp, %r13
-; AVX512F-ONLY-NEXT:    movq %r15, %rbp
-; AVX512F-ONLY-NEXT:    shlq $58, %rbp
-; AVX512F-ONLY-NEXT:    orq %r13, %rbp
-; AVX512F-ONLY-NEXT:    shlq $59, %r15
-; AVX512F-ONLY-NEXT:    orq %rbp, %r15
-; AVX512F-ONLY-NEXT:    movl %r12d, %r13d
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %rbp
-; AVX512F-ONLY-NEXT:    shlq $60, %rbp
-; AVX512F-ONLY-NEXT:    orq %r15, %rbp
-; AVX512F-ONLY-NEXT:    movq %r13, %r15
-; AVX512F-ONLY-NEXT:    shlq $61, %r15
-; AVX512F-ONLY-NEXT:    orq %rbp, %r15
-; AVX512F-ONLY-NEXT:    kmovw %k2, %ebp
-; AVX512F-ONLY-NEXT:    kshiftrw $12, %k1, %k2
-; AVX512F-ONLY-NEXT:    movzbl %r12b, %eax
-; AVX512F-ONLY-NEXT:    shlq $62, %r13
-; AVX512F-ONLY-NEXT:    orq %r15, %r13
-; AVX512F-ONLY-NEXT:    movq %rax, %r15
-; AVX512F-ONLY-NEXT:    shlq $63, %r15
-; AVX512F-ONLY-NEXT:    orq %r13, %r15
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r13d
-; AVX512F-ONLY-NEXT:    kshiftrw $11, %k1, %k2
-; AVX512F-ONLY-NEXT:    orq %r14, %r15
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r14d
-; AVX512F-ONLY-NEXT:    kshiftrw $13, %k1, %k1
-; AVX512F-ONLY-NEXT:    movq %r15, (%rsi)
-; AVX512F-ONLY-NEXT:    movzbl %r14b, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    leaq (%rax,%rax,2), %rax
-; AVX512F-ONLY-NEXT:    leaq (%rax,%r12,4), %rax
-; AVX512F-ONLY-NEXT:    leaq (%rax,%r12,8), %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r15
-; AVX512F-ONLY-NEXT:    shlq $4, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    movq %r12, %r14
-; AVX512F-ONLY-NEXT:    shlq $5, %r14
-; AVX512F-ONLY-NEXT:    orq %r15, %r14
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $6, %rax
-; AVX512F-ONLY-NEXT:    shlq $7, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movzbl %r13b, %r15d
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $8, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $9, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $10, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $11, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $12, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k1, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $1, %k0, %k1
-; AVX512F-ONLY-NEXT:    shlq $13, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    movzbl %r12b, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $14, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r15
-; AVX512F-ONLY-NEXT:    shlq $15, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $16, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r15
-; AVX512F-ONLY-NEXT:    shlq $17, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $18, %r13
-; AVX512F-ONLY-NEXT:    orq %r15, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k1, %r15d
-; AVX512F-ONLY-NEXT:    kshiftrw $2, %k0, %k1
-; AVX512F-ONLY-NEXT:    shlq $19, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    movzbl %bpl, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %r13
-; AVX512F-ONLY-NEXT:    shlq $20, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    movq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $21, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    movq %rax, %r13
-; AVX512F-ONLY-NEXT:    shlq $22, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    movq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $23, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    movq %rax, %r13
-; AVX512F-ONLY-NEXT:    shlq $24, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k1, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $4, %k0, %k1
-; AVX512F-ONLY-NEXT:    shlq $25, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movzbl %bl, %ebx
-; AVX512F-ONLY-NEXT:    andl $1, %ebx
-; AVX512F-ONLY-NEXT:    movq %rbx, %r13
-; AVX512F-ONLY-NEXT:    shlq $26, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %rbx, %rax
-; AVX512F-ONLY-NEXT:    shlq $27, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %rbx, %r13
-; AVX512F-ONLY-NEXT:    shlq $28, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %rbx, %rax
-; AVX512F-ONLY-NEXT:    shlq $29, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %rbx, %r13
-; AVX512F-ONLY-NEXT:    shlq $30, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    shlq $31, %rbx
-; AVX512F-ONLY-NEXT:    orq %r13, %rbx
-; AVX512F-ONLY-NEXT:    kmovw %k0, %r13d
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $32, %rax
-; AVX512F-ONLY-NEXT:    orq %rbx, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %rbx
-; AVX512F-ONLY-NEXT:    shlq $33, %rbx
-; AVX512F-ONLY-NEXT:    orq %rax, %rbx
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $34, %rax
-; AVX512F-ONLY-NEXT:    orq %rbx, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %rbx
-; AVX512F-ONLY-NEXT:    shlq $35, %rbx
-; AVX512F-ONLY-NEXT:    orq %rax, %rbx
-; AVX512F-ONLY-NEXT:    movq %r13, %rbp
-; AVX512F-ONLY-NEXT:    shlq $36, %rbp
-; AVX512F-ONLY-NEXT:    orq %rbx, %rbp
-; AVX512F-ONLY-NEXT:    kmovw %k1, %ebx
-; AVX512F-ONLY-NEXT:    kshiftrw $11, %k0, %k1
-; AVX512F-ONLY-NEXT:    shlq $37, %r13
-; AVX512F-ONLY-NEXT:    orq %rbp, %r13
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $38, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %r13
-; AVX512F-ONLY-NEXT:    shlq $39, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $40, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %rbp
-; AVX512F-ONLY-NEXT:    shlq $41, %rbp
-; AVX512F-ONLY-NEXT:    orq %rax, %rbp
-; AVX512F-ONLY-NEXT:    movq %r15, %r13
-; AVX512F-ONLY-NEXT:    shlq $42, %r13
-; AVX512F-ONLY-NEXT:    orq %rbp, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k1, %ebp
-; AVX512F-ONLY-NEXT:    kshiftrw $12, %k0, %k1
-; AVX512F-ONLY-NEXT:    shlq $43, %r15
-; AVX512F-ONLY-NEXT:    orq %r13, %r15
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $44, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r15
-; AVX512F-ONLY-NEXT:    shlq $45, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $46, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r15
-; AVX512F-ONLY-NEXT:    shlq $47, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $48, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k1, %r15d
-; AVX512F-ONLY-NEXT:    kshiftrw $13, %k0, %k1
-; AVX512F-ONLY-NEXT:    shlq $49, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movzbl %r11b, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %r11
-; AVX512F-ONLY-NEXT:    shlq $50, %r11
-; AVX512F-ONLY-NEXT:    orq %r12, %r11
-; AVX512F-ONLY-NEXT:    movq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $51, %r12
-; AVX512F-ONLY-NEXT:    orq %r11, %r12
-; AVX512F-ONLY-NEXT:    movq %rax, %r11
-; AVX512F-ONLY-NEXT:    shlq $52, %r11
-; AVX512F-ONLY-NEXT:    orq %r12, %r11
-; AVX512F-ONLY-NEXT:    movq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $53, %r12
-; AVX512F-ONLY-NEXT:    orq %r11, %r12
-; AVX512F-ONLY-NEXT:    movq %rax, %r13
-; AVX512F-ONLY-NEXT:    shlq $54, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k1, %r11d
-; AVX512F-ONLY-NEXT:    kshiftrw $14, %k0, %k0
-; AVX512F-ONLY-NEXT:    shlq $55, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    andl $1, %ebx
-; AVX512F-ONLY-NEXT:    movq %rbx, %r12
-; AVX512F-ONLY-NEXT:    shlq $56, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %rbx, %rax
-; AVX512F-ONLY-NEXT:    shlq $57, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %rbx, %r12
-; AVX512F-ONLY-NEXT:    shlq $58, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %rbx, %rax
-; AVX512F-ONLY-NEXT:    shlq $59, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %rbx, %r13
-; AVX512F-ONLY-NEXT:    shlq $60, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k0, %r12d
-; AVX512F-ONLY-NEXT:    shlq $61, %rbx
-; AVX512F-ONLY-NEXT:    orq %r13, %rbx
-; AVX512F-ONLY-NEXT:    movzbl %r10b, %eax
-; AVX512F-ONLY-NEXT:    # kill: def $r10d killed $r10d def $r10
-; AVX512F-ONLY-NEXT:    andl $1, %r10d
-; AVX512F-ONLY-NEXT:    shlq $62, %r10
-; AVX512F-ONLY-NEXT:    orq %rbx, %r10
-; AVX512F-ONLY-NEXT:    movq %rax, %rbx
-; AVX512F-ONLY-NEXT:    shlq $63, %rbx
-; AVX512F-ONLY-NEXT:    orq %r10, %rbx
-; AVX512F-ONLY-NEXT:    orq %r14, %rbx
-; AVX512F-ONLY-NEXT:    movq %rbx, 8(%rsi)
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    leaq (%rax,%rax,2), %r10
-; AVX512F-ONLY-NEXT:    leaq (%r10,%rax,4), %r10
-; AVX512F-ONLY-NEXT:    leaq (%r10,%rax,8), %rax
-; AVX512F-ONLY-NEXT:    movzbl %r9b, %r10d
-; AVX512F-ONLY-NEXT:    andl $1, %r10d
-; AVX512F-ONLY-NEXT:    movq %r10, %rbx
-; AVX512F-ONLY-NEXT:    shlq $4, %rbx
-; AVX512F-ONLY-NEXT:    orq %rax, %rbx
-; AVX512F-ONLY-NEXT:    movq %r10, %r9
-; AVX512F-ONLY-NEXT:    shlq $5, %r9
-; AVX512F-ONLY-NEXT:    orq %rbx, %r9
-; AVX512F-ONLY-NEXT:    movq %r10, %rax
-; AVX512F-ONLY-NEXT:    shlq $6, %rax
-; AVX512F-ONLY-NEXT:    movq %r10, %rbx
-; AVX512F-ONLY-NEXT:    shlq $7, %rbx
-; AVX512F-ONLY-NEXT:    orq %rax, %rbx
-; AVX512F-ONLY-NEXT:    movq %r10, %rax
-; AVX512F-ONLY-NEXT:    shlq $8, %rax
-; AVX512F-ONLY-NEXT:    orq %rbx, %rax
-; AVX512F-ONLY-NEXT:    shlq $9, %r10
-; AVX512F-ONLY-NEXT:    orq %rax, %r10
-; AVX512F-ONLY-NEXT:    movzbl %r8b, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %r8
-; AVX512F-ONLY-NEXT:    shlq $10, %r8
-; AVX512F-ONLY-NEXT:    orq %r10, %r8
-; AVX512F-ONLY-NEXT:    movq %rax, %r10
-; AVX512F-ONLY-NEXT:    shlq $11, %r10
-; AVX512F-ONLY-NEXT:    orq %r8, %r10
-; AVX512F-ONLY-NEXT:    movq %rax, %r8
-; AVX512F-ONLY-NEXT:    shlq $12, %r8
-; AVX512F-ONLY-NEXT:    orq %r10, %r8
-; AVX512F-ONLY-NEXT:    movq %rax, %r10
-; AVX512F-ONLY-NEXT:    shlq $13, %r10
-; AVX512F-ONLY-NEXT:    orq %r8, %r10
-; AVX512F-ONLY-NEXT:    movq %rax, %r8
-; AVX512F-ONLY-NEXT:    shlq $14, %r8
-; AVX512F-ONLY-NEXT:    orq %r10, %r8
-; AVX512F-ONLY-NEXT:    shlq $15, %rax
-; AVX512F-ONLY-NEXT:    orq %r8, %rax
-; AVX512F-ONLY-NEXT:    movzbl %dil, %edi
-; AVX512F-ONLY-NEXT:    andl $1, %edi
-; AVX512F-ONLY-NEXT:    movq %rdi, %r8
-; AVX512F-ONLY-NEXT:    shlq $16, %r8
-; AVX512F-ONLY-NEXT:    orq %rax, %r8
-; AVX512F-ONLY-NEXT:    movq %rdi, %rax
-; AVX512F-ONLY-NEXT:    shlq $17, %rax
-; AVX512F-ONLY-NEXT:    orq %r8, %rax
-; AVX512F-ONLY-NEXT:    movq %rdi, %r8
-; AVX512F-ONLY-NEXT:    shlq $18, %r8
-; AVX512F-ONLY-NEXT:    orq %rax, %r8
-; AVX512F-ONLY-NEXT:    movq %rdi, %rax
-; AVX512F-ONLY-NEXT:    shlq $19, %rax
-; AVX512F-ONLY-NEXT:    orq %r8, %rax
-; AVX512F-ONLY-NEXT:    movq %rdi, %r8
-; AVX512F-ONLY-NEXT:    shlq $20, %r8
-; AVX512F-ONLY-NEXT:    orq %rax, %r8
-; AVX512F-ONLY-NEXT:    shlq $21, %rdi
-; AVX512F-ONLY-NEXT:    orq %r8, %rdi
-; AVX512F-ONLY-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %r8
-; AVX512F-ONLY-NEXT:    shlq $22, %r8
-; AVX512F-ONLY-NEXT:    orq %rdi, %r8
-; AVX512F-ONLY-NEXT:    movq %rax, %rdi
-; AVX512F-ONLY-NEXT:    shlq $23, %rdi
-; AVX512F-ONLY-NEXT:    orq %r8, %rdi
-; AVX512F-ONLY-NEXT:    movq %rax, %r8
-; AVX512F-ONLY-NEXT:    shlq $24, %r8
-; AVX512F-ONLY-NEXT:    orq %rdi, %r8
-; AVX512F-ONLY-NEXT:    movq %rax, %rdi
-; AVX512F-ONLY-NEXT:    shlq $25, %rdi
-; AVX512F-ONLY-NEXT:    orq %r8, %rdi
-; AVX512F-ONLY-NEXT:    movq %rax, %r8
-; AVX512F-ONLY-NEXT:    shlq $26, %r8
-; AVX512F-ONLY-NEXT:    orq %rdi, %r8
-; AVX512F-ONLY-NEXT:    shlq $27, %rax
-; AVX512F-ONLY-NEXT:    orq %r8, %rax
-; AVX512F-ONLY-NEXT:    movzbl %cl, %edi
-; AVX512F-ONLY-NEXT:    andl $1, %edi
-; AVX512F-ONLY-NEXT:    movq %rdi, %r8
-; AVX512F-ONLY-NEXT:    shlq $28, %r8
-; AVX512F-ONLY-NEXT:    orq %rax, %r8
-; AVX512F-ONLY-NEXT:    movq %rdi, %rax
-; AVX512F-ONLY-NEXT:    shlq $29, %rax
-; AVX512F-ONLY-NEXT:    orq %r8, %rax
-; AVX512F-ONLY-NEXT:    movq %rdi, %r8
-; AVX512F-ONLY-NEXT:    shlq $30, %r8
-; AVX512F-ONLY-NEXT:    orq %rax, %r8
-; AVX512F-ONLY-NEXT:    shlq $31, %rdi
-; AVX512F-ONLY-NEXT:    orq %r8, %rdi
-; AVX512F-ONLY-NEXT:    andl $1, %ecx
-; AVX512F-ONLY-NEXT:    movq %rcx, %rax
-; AVX512F-ONLY-NEXT:    shlq $32, %rax
-; AVX512F-ONLY-NEXT:    orq %rdi, %rax
-; AVX512F-ONLY-NEXT:    shlq $33, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    andl $1, %ebp
-; AVX512F-ONLY-NEXT:    movq %rbp, %rax
-; AVX512F-ONLY-NEXT:    shlq $34, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    movq %rbp, %rcx
-; AVX512F-ONLY-NEXT:    shlq $35, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    movq %rbp, %rax
-; AVX512F-ONLY-NEXT:    shlq $36, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    movq %rbp, %rcx
-; AVX512F-ONLY-NEXT:    shlq $37, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    movq %rbp, %rax
-; AVX512F-ONLY-NEXT:    shlq $38, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    shlq $39, %rbp
-; AVX512F-ONLY-NEXT:    orq %rax, %rbp
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $40, %rax
-; AVX512F-ONLY-NEXT:    orq %rbp, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %rcx
-; AVX512F-ONLY-NEXT:    shlq $41, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $42, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %rcx
-; AVX512F-ONLY-NEXT:    shlq $43, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $44, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    shlq $45, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    andl $1, %r11d
-; AVX512F-ONLY-NEXT:    movq %r11, %rax
-; AVX512F-ONLY-NEXT:    shlq $46, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movq %r11, %rcx
-; AVX512F-ONLY-NEXT:    shlq $47, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    movq %r11, %rax
-; AVX512F-ONLY-NEXT:    shlq $48, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    movq %r11, %rcx
-; AVX512F-ONLY-NEXT:    shlq $49, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    movq %r11, %rax
-; AVX512F-ONLY-NEXT:    shlq $50, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    shlq $51, %r11
-; AVX512F-ONLY-NEXT:    orq %rax, %r11
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $52, %rax
-; AVX512F-ONLY-NEXT:    orq %r11, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %rcx
-; AVX512F-ONLY-NEXT:    shlq $53, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $54, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %rcx
-; AVX512F-ONLY-NEXT:    shlq $55, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $56, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    shlq $57, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movl %edx, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %rcx
-; AVX512F-ONLY-NEXT:    shlq $58, %rcx
-; AVX512F-ONLY-NEXT:    orq %r12, %rcx
-; AVX512F-ONLY-NEXT:    movq %rax, %rdi
-; AVX512F-ONLY-NEXT:    shlq $59, %rdi
-; AVX512F-ONLY-NEXT:    orq %rcx, %rdi
-; AVX512F-ONLY-NEXT:    movq %rax, %rcx
-; AVX512F-ONLY-NEXT:    shlq $60, %rcx
-; AVX512F-ONLY-NEXT:    orq %rdi, %rcx
-; AVX512F-ONLY-NEXT:    movq %rax, %rdi
-; AVX512F-ONLY-NEXT:    shlq $61, %rdi
-; AVX512F-ONLY-NEXT:    orq %rcx, %rdi
-; AVX512F-ONLY-NEXT:    shlq $62, %rax
-; AVX512F-ONLY-NEXT:    orq %rdi, %rax
-; AVX512F-ONLY-NEXT:    movzbl %dl, %ecx
-; AVX512F-ONLY-NEXT:    shlq $63, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    orq %r9, %rcx
-; AVX512F-ONLY-NEXT:    movq %rcx, 16(%rsi)
-; AVX512F-ONLY-NEXT:    popq %rbx
-; AVX512F-ONLY-NEXT:    popq %r12
-; AVX512F-ONLY-NEXT:    popq %r13
-; AVX512F-ONLY-NEXT:    popq %r14
-; AVX512F-ONLY-NEXT:    popq %r15
-; AVX512F-ONLY-NEXT:    popq %rbp
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm2
+; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; AVX512F-ONLY-NEXT:    movw $1, %ax
+; AVX512F-ONLY-NEXT:    kmovw %eax, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1}
+; AVX512F-ONLY-NEXT:    kmovw 2(%rdi), %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm2, %zmm4
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm5, %zmm6
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm7, %zmm8
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm9, %zmm10
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm11, %zmm0
+; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm2, %zmm2
+; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm5, %zmm5
+; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm7, %zmm7
+; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm9, %zmm9
+; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm11, %zmm3
+; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm11 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm3, %zmm3, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 704(%rsi), %zmm3 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm9, %zmm9, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 640(%rsi), %zmm9 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm7, %zmm7, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 576(%rsi), %zmm7 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm5, %zmm5, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 512(%rsi), %zmm5 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 448(%rsi), %zmm2 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 384(%rsi), %zmm1 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 320(%rsi), %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm10, %zmm10, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 256(%rsi), %zmm10 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm8, %zmm8, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm8 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm6, %zmm6, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm6 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm4, %zmm4, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm4 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm4, 64(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm6, 128(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm8, 192(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm10, 256(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, 320(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm1, 384(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm2, 448(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm5, 512(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm7, 576(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm9, 640(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm3, 704(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm11, (%rdx)
+; AVX512F-ONLY-NEXT:    vzeroupper
 ; AVX512F-ONLY-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: mask_replication_factor6_vf32:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    pushq %rbp
-; AVX512DQ-NEXT:    pushq %r15
-; AVX512DQ-NEXT:    pushq %r14
-; AVX512DQ-NEXT:    pushq %r13
-; AVX512DQ-NEXT:    pushq %r12
-; AVX512DQ-NEXT:    pushq %rbx
-; AVX512DQ-NEXT:    kmovw (%rdi), %k1
+; AVX512DQ-NEXT:    kmovw (%rdi), %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm2
+; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm2
+; AVX512DQ-NEXT:    movw $1, %ax
+; AVX512DQ-NEXT:    kmovw %eax, %k1
+; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1}
 ; AVX512DQ-NEXT:    kmovw 2(%rdi), %k0
-; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k2
-; AVX512DQ-NEXT:    kmovw %k2, %edx
-; AVX512DQ-NEXT:    kshiftrw $10, %k0, %k2
-; AVX512DQ-NEXT:    kmovw %k2, %ecx
-; AVX512DQ-NEXT:    kshiftrw $9, %k0, %k2
-; AVX512DQ-NEXT:    kmovw %k2, %eax
-; AVX512DQ-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512DQ-NEXT:    kshiftrw $8, %k0, %k2
-; AVX512DQ-NEXT:    kmovw %k2, %edi
-; AVX512DQ-NEXT:    kshiftrw $7, %k0, %k2
-; AVX512DQ-NEXT:    kmovw %k2, %r8d
-; AVX512DQ-NEXT:    kshiftrw $6, %k0, %k2
-; AVX512DQ-NEXT:    kmovw %k2, %r9d
-; AVX512DQ-NEXT:    kshiftrw $5, %k0, %k2
-; AVX512DQ-NEXT:    kmovw %k2, %r10d
-; AVX512DQ-NEXT:    kshiftrw $3, %k0, %k2
-; AVX512DQ-NEXT:    kmovw %k2, %r11d
-; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k2
-; AVX512DQ-NEXT:    kmovw %k2, %ebx
-; AVX512DQ-NEXT:    kshiftrw $5, %k1, %k2
-; AVX512DQ-NEXT:    kmovw %k2, %r15d
-; AVX512DQ-NEXT:    kshiftrw $1, %k1, %k2
-; AVX512DQ-NEXT:    kmovw %k2, %ebp
-; AVX512DQ-NEXT:    kshiftrw $2, %k1, %k2
-; AVX512DQ-NEXT:    kmovw %k1, %r14d
-; AVX512DQ-NEXT:    movzbl %r14b, %r14d
-; AVX512DQ-NEXT:    andl $1, %r14d
-; AVX512DQ-NEXT:    leaq (%r14,%r14,2), %r12
-; AVX512DQ-NEXT:    leaq (%r12,%r14,4), %r12
-; AVX512DQ-NEXT:    leaq (%r12,%r14,8), %r12
-; AVX512DQ-NEXT:    movq %r14, %r13
-; AVX512DQ-NEXT:    shlq $4, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    shlq $5, %r14
-; AVX512DQ-NEXT:    orq %r13, %r14
-; AVX512DQ-NEXT:    movzbl %bpl, %r13d
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $6, %r12
-; AVX512DQ-NEXT:    movq %r13, %rbp
-; AVX512DQ-NEXT:    shlq $7, %rbp
-; AVX512DQ-NEXT:    orq %r12, %rbp
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $8, %r12
-; AVX512DQ-NEXT:    orq %rbp, %r12
-; AVX512DQ-NEXT:    movq %r13, %rbp
-; AVX512DQ-NEXT:    shlq $9, %rbp
-; AVX512DQ-NEXT:    orq %r12, %rbp
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $10, %r12
-; AVX512DQ-NEXT:    orq %rbp, %r12
-; AVX512DQ-NEXT:    kmovw %k2, %ebp
-; AVX512DQ-NEXT:    kshiftrw $3, %k1, %k2
-; AVX512DQ-NEXT:    shlq $11, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    movzbl %bpl, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rbp
-; AVX512DQ-NEXT:    shlq $12, %rbp
-; AVX512DQ-NEXT:    orq %r13, %rbp
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $13, %r13
-; AVX512DQ-NEXT:    orq %rbp, %r13
-; AVX512DQ-NEXT:    movq %r12, %rbp
-; AVX512DQ-NEXT:    shlq $14, %rbp
-; AVX512DQ-NEXT:    orq %r13, %rbp
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $15, %r13
-; AVX512DQ-NEXT:    orq %rbp, %r13
-; AVX512DQ-NEXT:    movq %r12, %rbp
-; AVX512DQ-NEXT:    shlq $16, %rbp
-; AVX512DQ-NEXT:    orq %r13, %rbp
-; AVX512DQ-NEXT:    kmovw %k2, %r13d
-; AVX512DQ-NEXT:    kshiftrw $4, %k1, %k2
-; AVX512DQ-NEXT:    shlq $17, %r12
-; AVX512DQ-NEXT:    orq %rbp, %r12
-; AVX512DQ-NEXT:    movzbl %r13b, %r13d
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %rbp
-; AVX512DQ-NEXT:    shlq $18, %rbp
-; AVX512DQ-NEXT:    orq %r12, %rbp
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $19, %r12
-; AVX512DQ-NEXT:    orq %rbp, %r12
-; AVX512DQ-NEXT:    movq %r13, %rbp
-; AVX512DQ-NEXT:    shlq $20, %rbp
-; AVX512DQ-NEXT:    orq %r12, %rbp
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $21, %r12
-; AVX512DQ-NEXT:    orq %rbp, %r12
-; AVX512DQ-NEXT:    movq %r13, %rbp
-; AVX512DQ-NEXT:    shlq $22, %rbp
-; AVX512DQ-NEXT:    orq %r12, %rbp
-; AVX512DQ-NEXT:    kmovw %k2, %r12d
-; AVX512DQ-NEXT:    kshiftrw $10, %k1, %k2
-; AVX512DQ-NEXT:    shlq $23, %r13
-; AVX512DQ-NEXT:    orq %rbp, %r13
-; AVX512DQ-NEXT:    movzbl %r12b, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rbp
-; AVX512DQ-NEXT:    shlq $24, %rbp
-; AVX512DQ-NEXT:    orq %r13, %rbp
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $25, %r13
-; AVX512DQ-NEXT:    orq %rbp, %r13
-; AVX512DQ-NEXT:    movq %r12, %rbp
-; AVX512DQ-NEXT:    shlq $26, %rbp
-; AVX512DQ-NEXT:    orq %r13, %rbp
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $27, %r13
-; AVX512DQ-NEXT:    orq %rbp, %r13
-; AVX512DQ-NEXT:    movq %r12, %rbp
-; AVX512DQ-NEXT:    shlq $28, %rbp
-; AVX512DQ-NEXT:    orq %r13, %rbp
-; AVX512DQ-NEXT:    shlq $29, %r12
-; AVX512DQ-NEXT:    orq %rbp, %r12
-; AVX512DQ-NEXT:    movzbl %r15b, %r13d
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %rbp
-; AVX512DQ-NEXT:    shlq $30, %rbp
-; AVX512DQ-NEXT:    orq %r12, %rbp
-; AVX512DQ-NEXT:    kmovw %k2, %r12d
-; AVX512DQ-NEXT:    kshiftrw $6, %k1, %k2
-; AVX512DQ-NEXT:    shlq $31, %r13
-; AVX512DQ-NEXT:    orq %rbp, %r13
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    movq %r15, %rbp
-; AVX512DQ-NEXT:    shlq $32, %rbp
-; AVX512DQ-NEXT:    orq %r13, %rbp
-; AVX512DQ-NEXT:    movq %r15, %r13
-; AVX512DQ-NEXT:    shlq $33, %r13
-; AVX512DQ-NEXT:    orq %rbp, %r13
-; AVX512DQ-NEXT:    movq %r15, %rbp
-; AVX512DQ-NEXT:    shlq $34, %rbp
-; AVX512DQ-NEXT:    orq %r13, %rbp
-; AVX512DQ-NEXT:    kmovw %k2, %r13d
-; AVX512DQ-NEXT:    kshiftrw $7, %k1, %k2
-; AVX512DQ-NEXT:    shlq $35, %r15
-; AVX512DQ-NEXT:    orq %rbp, %r15
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %rbp
-; AVX512DQ-NEXT:    shlq $36, %rbp
-; AVX512DQ-NEXT:    orq %r15, %rbp
-; AVX512DQ-NEXT:    movq %r13, %r15
-; AVX512DQ-NEXT:    shlq $37, %r15
-; AVX512DQ-NEXT:    orq %rbp, %r15
-; AVX512DQ-NEXT:    movq %r13, %rbp
-; AVX512DQ-NEXT:    shlq $38, %rbp
-; AVX512DQ-NEXT:    orq %r15, %rbp
-; AVX512DQ-NEXT:    movq %r13, %r15
-; AVX512DQ-NEXT:    shlq $39, %r15
-; AVX512DQ-NEXT:    orq %rbp, %r15
-; AVX512DQ-NEXT:    movq %r13, %rbp
-; AVX512DQ-NEXT:    shlq $40, %rbp
-; AVX512DQ-NEXT:    orq %r15, %rbp
-; AVX512DQ-NEXT:    kmovw %k2, %r15d
-; AVX512DQ-NEXT:    kshiftrw $8, %k1, %k2
-; AVX512DQ-NEXT:    shlq $41, %r13
-; AVX512DQ-NEXT:    orq %rbp, %r13
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    movq %r15, %rbp
-; AVX512DQ-NEXT:    shlq $42, %rbp
-; AVX512DQ-NEXT:    orq %r13, %rbp
-; AVX512DQ-NEXT:    movq %r15, %r13
-; AVX512DQ-NEXT:    shlq $43, %r13
-; AVX512DQ-NEXT:    orq %rbp, %r13
-; AVX512DQ-NEXT:    movq %r15, %rbp
-; AVX512DQ-NEXT:    shlq $44, %rbp
-; AVX512DQ-NEXT:    orq %r13, %rbp
-; AVX512DQ-NEXT:    movq %r15, %r13
-; AVX512DQ-NEXT:    shlq $45, %r13
-; AVX512DQ-NEXT:    orq %rbp, %r13
-; AVX512DQ-NEXT:    movq %r15, %rbp
-; AVX512DQ-NEXT:    shlq $46, %rbp
-; AVX512DQ-NEXT:    orq %r13, %rbp
-; AVX512DQ-NEXT:    kmovw %k2, %r13d
-; AVX512DQ-NEXT:    kshiftrw $9, %k1, %k2
-; AVX512DQ-NEXT:    shlq $47, %r15
-; AVX512DQ-NEXT:    orq %rbp, %r15
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %rbp
-; AVX512DQ-NEXT:    shlq $48, %rbp
-; AVX512DQ-NEXT:    orq %r15, %rbp
-; AVX512DQ-NEXT:    movq %r13, %r15
-; AVX512DQ-NEXT:    shlq $49, %r15
-; AVX512DQ-NEXT:    orq %rbp, %r15
-; AVX512DQ-NEXT:    movq %r13, %rbp
-; AVX512DQ-NEXT:    shlq $50, %rbp
-; AVX512DQ-NEXT:    orq %r15, %rbp
-; AVX512DQ-NEXT:    movq %r13, %r15
-; AVX512DQ-NEXT:    shlq $51, %r15
-; AVX512DQ-NEXT:    orq %rbp, %r15
-; AVX512DQ-NEXT:    movq %r13, %rbp
-; AVX512DQ-NEXT:    shlq $52, %rbp
-; AVX512DQ-NEXT:    orq %r15, %rbp
-; AVX512DQ-NEXT:    kmovw %k2, %r15d
-; AVX512DQ-NEXT:    kshiftrw $14, %k1, %k2
-; AVX512DQ-NEXT:    shlq $53, %r13
-; AVX512DQ-NEXT:    orq %rbp, %r13
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    movq %r15, %rbp
-; AVX512DQ-NEXT:    shlq $54, %rbp
-; AVX512DQ-NEXT:    orq %r13, %rbp
-; AVX512DQ-NEXT:    movq %r15, %r13
-; AVX512DQ-NEXT:    shlq $55, %r13
-; AVX512DQ-NEXT:    orq %rbp, %r13
-; AVX512DQ-NEXT:    movq %r15, %rbp
-; AVX512DQ-NEXT:    shlq $56, %rbp
-; AVX512DQ-NEXT:    orq %r13, %rbp
-; AVX512DQ-NEXT:    movq %r15, %r13
-; AVX512DQ-NEXT:    shlq $57, %r13
-; AVX512DQ-NEXT:    orq %rbp, %r13
-; AVX512DQ-NEXT:    movq %r15, %rbp
-; AVX512DQ-NEXT:    shlq $58, %rbp
-; AVX512DQ-NEXT:    orq %r13, %rbp
-; AVX512DQ-NEXT:    shlq $59, %r15
-; AVX512DQ-NEXT:    orq %rbp, %r15
-; AVX512DQ-NEXT:    movl %r12d, %r13d
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %rbp
-; AVX512DQ-NEXT:    shlq $60, %rbp
-; AVX512DQ-NEXT:    orq %r15, %rbp
-; AVX512DQ-NEXT:    movq %r13, %r15
-; AVX512DQ-NEXT:    shlq $61, %r15
-; AVX512DQ-NEXT:    orq %rbp, %r15
-; AVX512DQ-NEXT:    kmovw %k2, %ebp
-; AVX512DQ-NEXT:    kshiftrw $12, %k1, %k2
-; AVX512DQ-NEXT:    movzbl %r12b, %eax
-; AVX512DQ-NEXT:    shlq $62, %r13
-; AVX512DQ-NEXT:    orq %r15, %r13
-; AVX512DQ-NEXT:    movq %rax, %r15
-; AVX512DQ-NEXT:    shlq $63, %r15
-; AVX512DQ-NEXT:    orq %r13, %r15
-; AVX512DQ-NEXT:    kmovw %k2, %r13d
-; AVX512DQ-NEXT:    kshiftrw $11, %k1, %k2
-; AVX512DQ-NEXT:    orq %r14, %r15
-; AVX512DQ-NEXT:    kmovw %k2, %r14d
-; AVX512DQ-NEXT:    kshiftrw $13, %k1, %k1
-; AVX512DQ-NEXT:    movq %r15, (%rsi)
-; AVX512DQ-NEXT:    movzbl %r14b, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    leaq (%rax,%rax,2), %rax
-; AVX512DQ-NEXT:    leaq (%rax,%r12,4), %rax
-; AVX512DQ-NEXT:    leaq (%rax,%r12,8), %rax
-; AVX512DQ-NEXT:    movq %r12, %r15
-; AVX512DQ-NEXT:    shlq $4, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    movq %r12, %r14
-; AVX512DQ-NEXT:    shlq $5, %r14
-; AVX512DQ-NEXT:    orq %r15, %r14
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $6, %rax
-; AVX512DQ-NEXT:    shlq $7, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movzbl %r13b, %r15d
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $8, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $9, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $10, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $11, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $12, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    kmovw %k1, %r12d
-; AVX512DQ-NEXT:    kshiftrw $1, %k0, %k1
-; AVX512DQ-NEXT:    shlq $13, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    movzbl %r12b, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $14, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movq %r12, %r15
-; AVX512DQ-NEXT:    shlq $15, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $16, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movq %r12, %r15
-; AVX512DQ-NEXT:    shlq $17, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $18, %r13
-; AVX512DQ-NEXT:    orq %r15, %r13
-; AVX512DQ-NEXT:    kmovw %k1, %r15d
-; AVX512DQ-NEXT:    kshiftrw $2, %k0, %k1
-; AVX512DQ-NEXT:    shlq $19, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    movzbl %bpl, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %r13
-; AVX512DQ-NEXT:    shlq $20, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    movq %rax, %r12
-; AVX512DQ-NEXT:    shlq $21, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    movq %rax, %r13
-; AVX512DQ-NEXT:    shlq $22, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    movq %rax, %r12
-; AVX512DQ-NEXT:    shlq $23, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    movq %rax, %r13
-; AVX512DQ-NEXT:    shlq $24, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    kmovw %k1, %r12d
-; AVX512DQ-NEXT:    kshiftrw $4, %k0, %k1
-; AVX512DQ-NEXT:    shlq $25, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movzbl %bl, %ebx
-; AVX512DQ-NEXT:    andl $1, %ebx
-; AVX512DQ-NEXT:    movq %rbx, %r13
-; AVX512DQ-NEXT:    shlq $26, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %rbx, %rax
-; AVX512DQ-NEXT:    shlq $27, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %rbx, %r13
-; AVX512DQ-NEXT:    shlq $28, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %rbx, %rax
-; AVX512DQ-NEXT:    shlq $29, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %rbx, %r13
-; AVX512DQ-NEXT:    shlq $30, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    shlq $31, %rbx
-; AVX512DQ-NEXT:    orq %r13, %rbx
-; AVX512DQ-NEXT:    kmovw %k0, %r13d
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $32, %rax
-; AVX512DQ-NEXT:    orq %rbx, %rax
-; AVX512DQ-NEXT:    movq %r13, %rbx
-; AVX512DQ-NEXT:    shlq $33, %rbx
-; AVX512DQ-NEXT:    orq %rax, %rbx
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $34, %rax
-; AVX512DQ-NEXT:    orq %rbx, %rax
-; AVX512DQ-NEXT:    movq %r13, %rbx
-; AVX512DQ-NEXT:    shlq $35, %rbx
-; AVX512DQ-NEXT:    orq %rax, %rbx
-; AVX512DQ-NEXT:    movq %r13, %rbp
-; AVX512DQ-NEXT:    shlq $36, %rbp
-; AVX512DQ-NEXT:    orq %rbx, %rbp
-; AVX512DQ-NEXT:    kmovw %k1, %ebx
-; AVX512DQ-NEXT:    kshiftrw $11, %k0, %k1
-; AVX512DQ-NEXT:    shlq $37, %r13
-; AVX512DQ-NEXT:    orq %rbp, %r13
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $38, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r15, %r13
-; AVX512DQ-NEXT:    shlq $39, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $40, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r15, %rbp
-; AVX512DQ-NEXT:    shlq $41, %rbp
-; AVX512DQ-NEXT:    orq %rax, %rbp
-; AVX512DQ-NEXT:    movq %r15, %r13
-; AVX512DQ-NEXT:    shlq $42, %r13
-; AVX512DQ-NEXT:    orq %rbp, %r13
-; AVX512DQ-NEXT:    kmovw %k1, %ebp
-; AVX512DQ-NEXT:    kshiftrw $12, %k0, %k1
-; AVX512DQ-NEXT:    shlq $43, %r15
-; AVX512DQ-NEXT:    orq %r13, %r15
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $44, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movq %r12, %r15
-; AVX512DQ-NEXT:    shlq $45, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $46, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movq %r12, %r15
-; AVX512DQ-NEXT:    shlq $47, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $48, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    kmovw %k1, %r15d
-; AVX512DQ-NEXT:    kshiftrw $13, %k0, %k1
-; AVX512DQ-NEXT:    shlq $49, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movzbl %r11b, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %r11
-; AVX512DQ-NEXT:    shlq $50, %r11
-; AVX512DQ-NEXT:    orq %r12, %r11
-; AVX512DQ-NEXT:    movq %rax, %r12
-; AVX512DQ-NEXT:    shlq $51, %r12
-; AVX512DQ-NEXT:    orq %r11, %r12
-; AVX512DQ-NEXT:    movq %rax, %r11
-; AVX512DQ-NEXT:    shlq $52, %r11
-; AVX512DQ-NEXT:    orq %r12, %r11
-; AVX512DQ-NEXT:    movq %rax, %r12
-; AVX512DQ-NEXT:    shlq $53, %r12
-; AVX512DQ-NEXT:    orq %r11, %r12
-; AVX512DQ-NEXT:    movq %rax, %r13
-; AVX512DQ-NEXT:    shlq $54, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    kmovw %k1, %r11d
-; AVX512DQ-NEXT:    kshiftrw $14, %k0, %k0
-; AVX512DQ-NEXT:    shlq $55, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    andl $1, %ebx
-; AVX512DQ-NEXT:    movq %rbx, %r12
-; AVX512DQ-NEXT:    shlq $56, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %rbx, %rax
-; AVX512DQ-NEXT:    shlq $57, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %rbx, %r12
-; AVX512DQ-NEXT:    shlq $58, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %rbx, %rax
-; AVX512DQ-NEXT:    shlq $59, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %rbx, %r13
-; AVX512DQ-NEXT:    shlq $60, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    kmovw %k0, %r12d
-; AVX512DQ-NEXT:    shlq $61, %rbx
-; AVX512DQ-NEXT:    orq %r13, %rbx
-; AVX512DQ-NEXT:    movzbl %r10b, %eax
-; AVX512DQ-NEXT:    # kill: def $r10d killed $r10d def $r10
-; AVX512DQ-NEXT:    andl $1, %r10d
-; AVX512DQ-NEXT:    shlq $62, %r10
-; AVX512DQ-NEXT:    orq %rbx, %r10
-; AVX512DQ-NEXT:    movq %rax, %rbx
-; AVX512DQ-NEXT:    shlq $63, %rbx
-; AVX512DQ-NEXT:    orq %r10, %rbx
-; AVX512DQ-NEXT:    orq %r14, %rbx
-; AVX512DQ-NEXT:    movq %rbx, 8(%rsi)
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    leaq (%rax,%rax,2), %r10
-; AVX512DQ-NEXT:    leaq (%r10,%rax,4), %r10
-; AVX512DQ-NEXT:    leaq (%r10,%rax,8), %rax
-; AVX512DQ-NEXT:    movzbl %r9b, %r10d
-; AVX512DQ-NEXT:    andl $1, %r10d
-; AVX512DQ-NEXT:    movq %r10, %rbx
-; AVX512DQ-NEXT:    shlq $4, %rbx
-; AVX512DQ-NEXT:    orq %rax, %rbx
-; AVX512DQ-NEXT:    movq %r10, %r9
-; AVX512DQ-NEXT:    shlq $5, %r9
-; AVX512DQ-NEXT:    orq %rbx, %r9
-; AVX512DQ-NEXT:    movq %r10, %rax
-; AVX512DQ-NEXT:    shlq $6, %rax
-; AVX512DQ-NEXT:    movq %r10, %rbx
-; AVX512DQ-NEXT:    shlq $7, %rbx
-; AVX512DQ-NEXT:    orq %rax, %rbx
-; AVX512DQ-NEXT:    movq %r10, %rax
-; AVX512DQ-NEXT:    shlq $8, %rax
-; AVX512DQ-NEXT:    orq %rbx, %rax
-; AVX512DQ-NEXT:    shlq $9, %r10
-; AVX512DQ-NEXT:    orq %rax, %r10
-; AVX512DQ-NEXT:    movzbl %r8b, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %r8
-; AVX512DQ-NEXT:    shlq $10, %r8
-; AVX512DQ-NEXT:    orq %r10, %r8
-; AVX512DQ-NEXT:    movq %rax, %r10
-; AVX512DQ-NEXT:    shlq $11, %r10
-; AVX512DQ-NEXT:    orq %r8, %r10
-; AVX512DQ-NEXT:    movq %rax, %r8
-; AVX512DQ-NEXT:    shlq $12, %r8
-; AVX512DQ-NEXT:    orq %r10, %r8
-; AVX512DQ-NEXT:    movq %rax, %r10
-; AVX512DQ-NEXT:    shlq $13, %r10
-; AVX512DQ-NEXT:    orq %r8, %r10
-; AVX512DQ-NEXT:    movq %rax, %r8
-; AVX512DQ-NEXT:    shlq $14, %r8
-; AVX512DQ-NEXT:    orq %r10, %r8
-; AVX512DQ-NEXT:    shlq $15, %rax
-; AVX512DQ-NEXT:    orq %r8, %rax
-; AVX512DQ-NEXT:    movzbl %dil, %edi
-; AVX512DQ-NEXT:    andl $1, %edi
-; AVX512DQ-NEXT:    movq %rdi, %r8
-; AVX512DQ-NEXT:    shlq $16, %r8
-; AVX512DQ-NEXT:    orq %rax, %r8
-; AVX512DQ-NEXT:    movq %rdi, %rax
-; AVX512DQ-NEXT:    shlq $17, %rax
-; AVX512DQ-NEXT:    orq %r8, %rax
-; AVX512DQ-NEXT:    movq %rdi, %r8
-; AVX512DQ-NEXT:    shlq $18, %r8
-; AVX512DQ-NEXT:    orq %rax, %r8
-; AVX512DQ-NEXT:    movq %rdi, %rax
-; AVX512DQ-NEXT:    shlq $19, %rax
-; AVX512DQ-NEXT:    orq %r8, %rax
-; AVX512DQ-NEXT:    movq %rdi, %r8
-; AVX512DQ-NEXT:    shlq $20, %r8
-; AVX512DQ-NEXT:    orq %rax, %r8
-; AVX512DQ-NEXT:    shlq $21, %rdi
-; AVX512DQ-NEXT:    orq %r8, %rdi
-; AVX512DQ-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %r8
-; AVX512DQ-NEXT:    shlq $22, %r8
-; AVX512DQ-NEXT:    orq %rdi, %r8
-; AVX512DQ-NEXT:    movq %rax, %rdi
-; AVX512DQ-NEXT:    shlq $23, %rdi
-; AVX512DQ-NEXT:    orq %r8, %rdi
-; AVX512DQ-NEXT:    movq %rax, %r8
-; AVX512DQ-NEXT:    shlq $24, %r8
-; AVX512DQ-NEXT:    orq %rdi, %r8
-; AVX512DQ-NEXT:    movq %rax, %rdi
-; AVX512DQ-NEXT:    shlq $25, %rdi
-; AVX512DQ-NEXT:    orq %r8, %rdi
-; AVX512DQ-NEXT:    movq %rax, %r8
-; AVX512DQ-NEXT:    shlq $26, %r8
-; AVX512DQ-NEXT:    orq %rdi, %r8
-; AVX512DQ-NEXT:    shlq $27, %rax
-; AVX512DQ-NEXT:    orq %r8, %rax
-; AVX512DQ-NEXT:    movzbl %cl, %edi
-; AVX512DQ-NEXT:    andl $1, %edi
-; AVX512DQ-NEXT:    movq %rdi, %r8
-; AVX512DQ-NEXT:    shlq $28, %r8
-; AVX512DQ-NEXT:    orq %rax, %r8
-; AVX512DQ-NEXT:    movq %rdi, %rax
-; AVX512DQ-NEXT:    shlq $29, %rax
-; AVX512DQ-NEXT:    orq %r8, %rax
-; AVX512DQ-NEXT:    movq %rdi, %r8
-; AVX512DQ-NEXT:    shlq $30, %r8
-; AVX512DQ-NEXT:    orq %rax, %r8
-; AVX512DQ-NEXT:    shlq $31, %rdi
-; AVX512DQ-NEXT:    orq %r8, %rdi
-; AVX512DQ-NEXT:    andl $1, %ecx
-; AVX512DQ-NEXT:    movq %rcx, %rax
-; AVX512DQ-NEXT:    shlq $32, %rax
-; AVX512DQ-NEXT:    orq %rdi, %rax
-; AVX512DQ-NEXT:    shlq $33, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    andl $1, %ebp
-; AVX512DQ-NEXT:    movq %rbp, %rax
-; AVX512DQ-NEXT:    shlq $34, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    movq %rbp, %rcx
-; AVX512DQ-NEXT:    shlq $35, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    movq %rbp, %rax
-; AVX512DQ-NEXT:    shlq $36, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    movq %rbp, %rcx
-; AVX512DQ-NEXT:    shlq $37, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    movq %rbp, %rax
-; AVX512DQ-NEXT:    shlq $38, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    shlq $39, %rbp
-; AVX512DQ-NEXT:    orq %rax, %rbp
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $40, %rax
-; AVX512DQ-NEXT:    orq %rbp, %rax
-; AVX512DQ-NEXT:    movq %r15, %rcx
-; AVX512DQ-NEXT:    shlq $41, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $42, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    movq %r15, %rcx
-; AVX512DQ-NEXT:    shlq $43, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $44, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    shlq $45, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    andl $1, %r11d
-; AVX512DQ-NEXT:    movq %r11, %rax
-; AVX512DQ-NEXT:    shlq $46, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movq %r11, %rcx
-; AVX512DQ-NEXT:    shlq $47, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    movq %r11, %rax
-; AVX512DQ-NEXT:    shlq $48, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    movq %r11, %rcx
-; AVX512DQ-NEXT:    shlq $49, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    movq %r11, %rax
-; AVX512DQ-NEXT:    shlq $50, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    shlq $51, %r11
-; AVX512DQ-NEXT:    orq %rax, %r11
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $52, %rax
-; AVX512DQ-NEXT:    orq %r11, %rax
-; AVX512DQ-NEXT:    movq %r12, %rcx
-; AVX512DQ-NEXT:    shlq $53, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $54, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    movq %r12, %rcx
-; AVX512DQ-NEXT:    shlq $55, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $56, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    shlq $57, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movl %edx, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %rcx
-; AVX512DQ-NEXT:    shlq $58, %rcx
-; AVX512DQ-NEXT:    orq %r12, %rcx
-; AVX512DQ-NEXT:    movq %rax, %rdi
-; AVX512DQ-NEXT:    shlq $59, %rdi
-; AVX512DQ-NEXT:    orq %rcx, %rdi
-; AVX512DQ-NEXT:    movq %rax, %rcx
-; AVX512DQ-NEXT:    shlq $60, %rcx
-; AVX512DQ-NEXT:    orq %rdi, %rcx
-; AVX512DQ-NEXT:    movq %rax, %rdi
-; AVX512DQ-NEXT:    shlq $61, %rdi
-; AVX512DQ-NEXT:    orq %rcx, %rdi
-; AVX512DQ-NEXT:    shlq $62, %rax
-; AVX512DQ-NEXT:    orq %rdi, %rax
-; AVX512DQ-NEXT:    movzbl %dl, %ecx
-; AVX512DQ-NEXT:    shlq $63, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    orq %r9, %rcx
-; AVX512DQ-NEXT:    movq %rcx, 16(%rsi)
-; AVX512DQ-NEXT:    popq %rbx
-; AVX512DQ-NEXT:    popq %r12
-; AVX512DQ-NEXT:    popq %r13
-; AVX512DQ-NEXT:    popq %r14
-; AVX512DQ-NEXT:    popq %r15
-; AVX512DQ-NEXT:    popq %rbp
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm3
+; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k1
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm2, %zmm4
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm5, %zmm6
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm7, %zmm8
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm9, %zmm10
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm11, %zmm0
+; AVX512DQ-NEXT:    vpermd %zmm3, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpermd %zmm3, %zmm2, %zmm2
+; AVX512DQ-NEXT:    vpermd %zmm3, %zmm5, %zmm5
+; AVX512DQ-NEXT:    vpermd %zmm3, %zmm7, %zmm7
+; AVX512DQ-NEXT:    vpermd %zmm3, %zmm9, %zmm9
+; AVX512DQ-NEXT:    vpermd %zmm3, %zmm11, %zmm3
+; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm11 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm3, %k1
+; AVX512DQ-NEXT:    vmovdqa32 704(%rsi), %zmm3 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm9, %k1
+; AVX512DQ-NEXT:    vmovdqa32 640(%rsi), %zmm9 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm7, %k1
+; AVX512DQ-NEXT:    vmovdqa32 576(%rsi), %zmm7 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm5, %k1
+; AVX512DQ-NEXT:    vmovdqa32 512(%rsi), %zmm5 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k1
+; AVX512DQ-NEXT:    vmovdqa32 448(%rsi), %zmm2 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k1
+; AVX512DQ-NEXT:    vmovdqa32 384(%rsi), %zmm1 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k1
+; AVX512DQ-NEXT:    vmovdqa32 320(%rsi), %zmm0 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm10, %k1
+; AVX512DQ-NEXT:    vmovdqa32 256(%rsi), %zmm10 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm8, %k1
+; AVX512DQ-NEXT:    vmovdqa32 192(%rsi), %zmm8 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm6, %k1
+; AVX512DQ-NEXT:    vmovdqa32 128(%rsi), %zmm6 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm4, %k1
+; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm4 {%k1} {z}
+; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 64(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm6, 128(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm8, 192(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm10, 256(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 320(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 384(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 448(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm5, 512(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm7, 576(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm9, 640(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 704(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm11, (%rdx)
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512BW-LABEL: mask_replication_factor6_vf32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    pushq %rbp
-; AVX512BW-NEXT:    pushq %r15
-; AVX512BW-NEXT:    pushq %r14
-; AVX512BW-NEXT:    pushq %r13
-; AVX512BW-NEXT:    pushq %r12
-; AVX512BW-NEXT:    pushq %rbx
-; AVX512BW-NEXT:    kmovd (%rdi), %k0
-; AVX512BW-NEXT:    kshiftrd $31, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %edx
-; AVX512BW-NEXT:    kshiftrd $26, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %ecx
-; AVX512BW-NEXT:    kshiftrd $25, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512BW-NEXT:    kshiftrd $24, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %edi
-; AVX512BW-NEXT:    kshiftrd $23, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r8d
-; AVX512BW-NEXT:    kshiftrd $22, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r9d
-; AVX512BW-NEXT:    kshiftrd $21, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r10d
-; AVX512BW-NEXT:    kshiftrd $15, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r11d
-; AVX512BW-NEXT:    kshiftrd $14, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %ebp
-; AVX512BW-NEXT:    kshiftrd $5, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r14d
-; AVX512BW-NEXT:    kshiftrd $1, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r15d
-; AVX512BW-NEXT:    kshiftrd $2, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k0, %ebx
-; AVX512BW-NEXT:    movzbl %bl, %ebx
-; AVX512BW-NEXT:    andl $1, %ebx
-; AVX512BW-NEXT:    leaq (%rbx,%rbx,2), %r12
-; AVX512BW-NEXT:    leaq (%r12,%rbx,4), %r12
-; AVX512BW-NEXT:    leaq (%r12,%rbx,8), %r12
-; AVX512BW-NEXT:    movq %rbx, %r13
-; AVX512BW-NEXT:    shlq $4, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    shlq $5, %rbx
-; AVX512BW-NEXT:    orq %r13, %rbx
-; AVX512BW-NEXT:    movzbl %r15b, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %r15
-; AVX512BW-NEXT:    shlq $6, %r15
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $7, %r13
-; AVX512BW-NEXT:    orq %r15, %r13
-; AVX512BW-NEXT:    movq %r12, %r15
-; AVX512BW-NEXT:    shlq $8, %r15
-; AVX512BW-NEXT:    orq %r13, %r15
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $9, %r13
-; AVX512BW-NEXT:    orq %r15, %r13
-; AVX512BW-NEXT:    movq %r12, %r15
-; AVX512BW-NEXT:    shlq $10, %r15
-; AVX512BW-NEXT:    orq %r13, %r15
-; AVX512BW-NEXT:    kmovd %k1, %r13d
-; AVX512BW-NEXT:    kshiftrd $3, %k0, %k1
-; AVX512BW-NEXT:    shlq $11, %r12
-; AVX512BW-NEXT:    orq %r15, %r12
-; AVX512BW-NEXT:    movzbl %r13b, %r15d
-; AVX512BW-NEXT:    andl $1, %r15d
-; AVX512BW-NEXT:    movq %r15, %r13
-; AVX512BW-NEXT:    shlq $12, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    movq %r15, %r12
-; AVX512BW-NEXT:    shlq $13, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    movq %r15, %r13
-; AVX512BW-NEXT:    shlq $14, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    movq %r15, %r12
-; AVX512BW-NEXT:    shlq $15, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    movq %r15, %r13
-; AVX512BW-NEXT:    shlq $16, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrd $4, %k0, %k1
-; AVX512BW-NEXT:    shlq $17, %r15
-; AVX512BW-NEXT:    orq %r13, %r15
-; AVX512BW-NEXT:    movzbl %r12b, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $18, %r13
-; AVX512BW-NEXT:    orq %r15, %r13
-; AVX512BW-NEXT:    movq %r12, %r15
-; AVX512BW-NEXT:    shlq $19, %r15
-; AVX512BW-NEXT:    orq %r13, %r15
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $20, %r13
-; AVX512BW-NEXT:    orq %r15, %r13
-; AVX512BW-NEXT:    movq %r12, %r15
-; AVX512BW-NEXT:    shlq $21, %r15
-; AVX512BW-NEXT:    orq %r13, %r15
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $22, %r13
-; AVX512BW-NEXT:    orq %r15, %r13
-; AVX512BW-NEXT:    kmovd %k1, %r15d
-; AVX512BW-NEXT:    kshiftrd $10, %k0, %k1
-; AVX512BW-NEXT:    shlq $23, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    movzbl %r15b, %r15d
-; AVX512BW-NEXT:    andl $1, %r15d
-; AVX512BW-NEXT:    movq %r15, %r13
-; AVX512BW-NEXT:    shlq $24, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    movq %r15, %r12
-; AVX512BW-NEXT:    shlq $25, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    movq %r15, %r13
-; AVX512BW-NEXT:    shlq $26, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    movq %r15, %r12
-; AVX512BW-NEXT:    shlq $27, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    movq %r15, %r13
-; AVX512BW-NEXT:    shlq $28, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    shlq $29, %r15
-; AVX512BW-NEXT:    orq %r13, %r15
-; AVX512BW-NEXT:    movzbl %r14b, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $30, %r13
-; AVX512BW-NEXT:    orq %r15, %r13
-; AVX512BW-NEXT:    kmovd %k1, %r15d
-; AVX512BW-NEXT:    kshiftrd $6, %k0, %k1
-; AVX512BW-NEXT:    shlq $31, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    andl $1, %r14d
-; AVX512BW-NEXT:    movq %r14, %r13
-; AVX512BW-NEXT:    shlq $32, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    movq %r14, %r12
-; AVX512BW-NEXT:    shlq $33, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    movq %r14, %r13
-; AVX512BW-NEXT:    shlq $34, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrd $7, %k0, %k1
-; AVX512BW-NEXT:    shlq $35, %r14
-; AVX512BW-NEXT:    orq %r13, %r14
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $36, %r13
-; AVX512BW-NEXT:    orq %r14, %r13
-; AVX512BW-NEXT:    movq %r12, %r14
-; AVX512BW-NEXT:    shlq $37, %r14
-; AVX512BW-NEXT:    orq %r13, %r14
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $38, %r13
-; AVX512BW-NEXT:    orq %r14, %r13
-; AVX512BW-NEXT:    movq %r12, %r14
-; AVX512BW-NEXT:    shlq $39, %r14
-; AVX512BW-NEXT:    orq %r13, %r14
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $40, %r13
-; AVX512BW-NEXT:    orq %r14, %r13
-; AVX512BW-NEXT:    kmovd %k1, %r14d
-; AVX512BW-NEXT:    kshiftrd $8, %k0, %k1
-; AVX512BW-NEXT:    shlq $41, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    andl $1, %r14d
-; AVX512BW-NEXT:    movq %r14, %r13
-; AVX512BW-NEXT:    shlq $42, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    movq %r14, %r12
-; AVX512BW-NEXT:    shlq $43, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    movq %r14, %r13
-; AVX512BW-NEXT:    shlq $44, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    movq %r14, %r12
-; AVX512BW-NEXT:    shlq $45, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    movq %r14, %r13
-; AVX512BW-NEXT:    shlq $46, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrd $9, %k0, %k1
-; AVX512BW-NEXT:    shlq $47, %r14
-; AVX512BW-NEXT:    orq %r13, %r14
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $48, %r13
-; AVX512BW-NEXT:    orq %r14, %r13
-; AVX512BW-NEXT:    movq %r12, %r14
-; AVX512BW-NEXT:    shlq $49, %r14
-; AVX512BW-NEXT:    orq %r13, %r14
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $50, %r13
-; AVX512BW-NEXT:    orq %r14, %r13
-; AVX512BW-NEXT:    movq %r12, %r14
-; AVX512BW-NEXT:    shlq $51, %r14
-; AVX512BW-NEXT:    orq %r13, %r14
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $52, %r13
-; AVX512BW-NEXT:    orq %r14, %r13
-; AVX512BW-NEXT:    kmovd %k1, %r14d
-; AVX512BW-NEXT:    kshiftrd $13, %k0, %k1
-; AVX512BW-NEXT:    shlq $53, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    andl $1, %r14d
-; AVX512BW-NEXT:    movq %r14, %r13
-; AVX512BW-NEXT:    shlq $54, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    movq %r14, %r12
-; AVX512BW-NEXT:    shlq $55, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    movq %r14, %r13
-; AVX512BW-NEXT:    shlq $56, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    movq %r14, %r12
-; AVX512BW-NEXT:    shlq $57, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    movq %r14, %r13
-; AVX512BW-NEXT:    shlq $58, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    shlq $59, %r14
-; AVX512BW-NEXT:    orq %r13, %r14
-; AVX512BW-NEXT:    movl %r15d, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $60, %r13
-; AVX512BW-NEXT:    orq %r14, %r13
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $61, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    kmovd %k1, %r14d
-; AVX512BW-NEXT:    kshiftrd $12, %k0, %k1
-; AVX512BW-NEXT:    movzbl %r15b, %r15d
-; AVX512BW-NEXT:    shlq $62, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $63, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    kmovd %k1, %r13d
-; AVX512BW-NEXT:    kshiftrd $11, %k0, %k1
-; AVX512BW-NEXT:    orq %rbx, %rax
-; AVX512BW-NEXT:    kmovd %k1, %ebx
+; AVX512BW-NEXT:    kmovd (%rdi), %k5
+; AVX512BW-NEXT:    movw $-3, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k0
+; AVX512BW-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k0, %k5, %k2
+; AVX512BW-NEXT:    kshiftlw $15, %k5, %k1
+; AVX512BW-NEXT:    kshiftrw $14, %k1, %k3
+; AVX512BW-NEXT:    korw %k3, %k2, %k2
+; AVX512BW-NEXT:    movw $-5, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k0
+; AVX512BW-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k0, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $13, %k1, %k3
+; AVX512BW-NEXT:    korw %k3, %k2, %k2
+; AVX512BW-NEXT:    movw $-9, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k0
+; AVX512BW-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k0, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $12, %k1, %k3
+; AVX512BW-NEXT:    korw %k3, %k2, %k2
+; AVX512BW-NEXT:    movw $-17, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k7
+; AVX512BW-NEXT:    kandw %k7, %k2, %k2
+; AVX512BW-NEXT:    kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kshiftrw $11, %k1, %k3
+; AVX512BW-NEXT:    korw %k3, %k2, %k2
+; AVX512BW-NEXT:    movw $-33, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k0
+; AVX512BW-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k0, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $10, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k2, %k1
+; AVX512BW-NEXT:    movw $-65, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k0
+; AVX512BW-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k0, %k1, %k2
+; AVX512BW-NEXT:    kshiftrd $1, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $9, %k1, %k3
+; AVX512BW-NEXT:    korw %k3, %k2, %k2
+; AVX512BW-NEXT:    movw $-129, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k0
+; AVX512BW-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k0, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $8, %k1, %k3
+; AVX512BW-NEXT:    korw %k3, %k2, %k2
+; AVX512BW-NEXT:    movw $-257, %ax # imm = 0xFEFF
+; AVX512BW-NEXT:    kmovd %eax, %k0
+; AVX512BW-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k0, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $7, %k1, %k3
+; AVX512BW-NEXT:    korw %k3, %k2, %k2
+; AVX512BW-NEXT:    movw $-513, %ax # imm = 0xFDFF
+; AVX512BW-NEXT:    kmovd %eax, %k6
+; AVX512BW-NEXT:    kandw %k6, %k2, %k2
+; AVX512BW-NEXT:    kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kshiftrw $6, %k1, %k3
+; AVX512BW-NEXT:    korw %k3, %k2, %k2
+; AVX512BW-NEXT:    movw $-1025, %ax # imm = 0xFBFF
+; AVX512BW-NEXT:    kmovd %eax, %k0
+; AVX512BW-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k0, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $5, %k1, %k3
+; AVX512BW-NEXT:    korw %k3, %k2, %k2
+; AVX512BW-NEXT:    movw $-2049, %ax # imm = 0xF7FF
+; AVX512BW-NEXT:    kmovd %eax, %k0
+; AVX512BW-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k0, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $4, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k2, %k1
+; AVX512BW-NEXT:    movw $-4097, %ax # imm = 0xEFFF
+; AVX512BW-NEXT:    kmovd %eax, %k0
+; AVX512BW-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k0, %k1, %k1
+; AVX512BW-NEXT:    kshiftrd $2, %k5, %k2
+; AVX512BW-NEXT:    kshiftlw $15, %k2, %k3
+; AVX512BW-NEXT:    kmovq %k2, %k4
+; AVX512BW-NEXT:    kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512BW-NEXT:    kshiftrw $3, %k3, %k2
+; AVX512BW-NEXT:    korw %k2, %k1, %k1
+; AVX512BW-NEXT:    movw $-8193, %ax # imm = 0xDFFF
+; AVX512BW-NEXT:    kmovd %eax, %k0
+; AVX512BW-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k0, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $2, %k3, %k2
+; AVX512BW-NEXT:    kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    korw %k2, %k1, %k1
+; AVX512BW-NEXT:    movw $-16385, %ax # imm = 0xBFFF
+; AVX512BW-NEXT:    kmovd %eax, %k0
+; AVX512BW-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k0, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $14, %k4, %k2
+; AVX512BW-NEXT:    korw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
+; AVX512BW-NEXT:    korw %k3, %k1, %k1
+; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    kshiftrd $29, %k5, %k0
+; AVX512BW-NEXT:    kmovd %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k2
+; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kshiftrw $14, %k2, %k3
+; AVX512BW-NEXT:    korw %k3, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $13, %k2, %k3
+; AVX512BW-NEXT:    korw %k3, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $12, %k2, %k3
+; AVX512BW-NEXT:    korw %k3, %k1, %k1
+; AVX512BW-NEXT:    kandw %k7, %k1, %k1
+; AVX512BW-NEXT:    kshiftrd $30, %k5, %k3
+; AVX512BW-NEXT:    kshiftlw $15, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $11, %k3, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $10, %k3, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $9, %k3, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $8, %k3, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $7, %k3, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $6, %k3, %k3
+; AVX512BW-NEXT:    korw %k3, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k3
+; AVX512BW-NEXT:    kshiftrd $31, %k5, %k7
+; AVX512BW-NEXT:    kshiftlw $15, %k7, %k1
+; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $4, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $3, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $2, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k3, %k3
+; AVX512BW-NEXT:    kshiftlw $14, %k7, %k6
+; AVX512BW-NEXT:    korw %k6, %k3, %k3
+; AVX512BW-NEXT:    kshiftlw $1, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $1, %k3, %k3
+; AVX512BW-NEXT:    korw %k1, %k3, %k1
+; AVX512BW-NEXT:    vmovdqa32 704(%rsi), %zmm1 {%k1} {z}
+; AVX512BW-NEXT:    kshiftrd $26, %k5, %k3
+; AVX512BW-NEXT:    kmovd %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k3, %k6
+; AVX512BW-NEXT:    kshiftlw $15, %k3, %k1
+; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kshiftrw $14, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k6, %k6
+; AVX512BW-NEXT:    kshiftrd $27, %k5, %k7
+; AVX512BW-NEXT:    kmovq %k5, %k3
+; AVX512BW-NEXT:    kmovd %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512BW-NEXT:    kshiftlw $15, %k7, %k7
+; AVX512BW-NEXT:    kshiftrw $13, %k7, %k5
+; AVX512BW-NEXT:    korw %k5, %k6, %k5
+; AVX512BW-NEXT:    kandw %k4, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $12, %k7, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $11, %k7, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $10, %k7, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $9, %k7, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $8, %k7, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kandw %k0, %k5, %k5
+; AVX512BW-NEXT:    kshiftrd $28, %k3, %k6
+; AVX512BW-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $7, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $6, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $5, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $4, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $2, %k6, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kandw %k2, %k5, %k5
+; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
+; AVX512BW-NEXT:    kshiftlw $14, %k2, %k4
+; AVX512BW-NEXT:    korw %k4, %k5, %k4
+; AVX512BW-NEXT:    kshiftlw $1, %k4, %k4
+; AVX512BW-NEXT:    kshiftrw $1, %k4, %k4
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    korw %k2, %k4, %k2
+; AVX512BW-NEXT:    vmovdqa32 640(%rsi), %zmm2 {%k2} {z}
+; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload
+; AVX512BW-NEXT:    kshiftrd $24, %k0, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k2, %k4
+; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $14, %k2, %k5
+; AVX512BW-NEXT:    korw %k5, %k4, %k4
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k4, %k4
+; AVX512BW-NEXT:    kshiftrw $13, %k2, %k5
+; AVX512BW-NEXT:    korw %k5, %k4, %k4
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k4, %k4
+; AVX512BW-NEXT:    kshiftrw $12, %k2, %k5
+; AVX512BW-NEXT:    korw %k5, %k4, %k4
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k4, %k4
+; AVX512BW-NEXT:    kshiftrw $11, %k2, %k5
+; AVX512BW-NEXT:    korw %k5, %k4, %k4
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k4, %k4
+; AVX512BW-NEXT:    kshiftrw $10, %k2, %k2
+; AVX512BW-NEXT:    korw %k2, %k4, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k2, %k2
+; AVX512BW-NEXT:    kshiftrd $25, %k0, %k4
+; AVX512BW-NEXT:    kshiftlw $15, %k4, %k4
+; AVX512BW-NEXT:    kshiftrw $9, %k4, %k5
+; AVX512BW-NEXT:    korw %k5, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $8, %k4, %k5
+; AVX512BW-NEXT:    korw %k5, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $7, %k4, %k5
+; AVX512BW-NEXT:    korw %k5, %k2, %k2
+; AVX512BW-NEXT:    kandw %k3, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $6, %k4, %k5
+; AVX512BW-NEXT:    korw %k5, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $5, %k4, %k5
+; AVX512BW-NEXT:    korw %k5, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $4, %k4, %k4
+; AVX512BW-NEXT:    korw %k4, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kshiftrw $3, %k0, %k4
+; AVX512BW-NEXT:    korw %k4, %k2, %k2
+; AVX512BW-NEXT:    kandw %k1, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $2, %k0, %k4
+; AVX512BW-NEXT:    kmovq %k0, %k1
+; AVX512BW-NEXT:    korw %k4, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k2, %k2
+; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload
+; AVX512BW-NEXT:    kshiftlw $14, %k0, %k3
+; AVX512BW-NEXT:    korw %k3, %k2, %k2
+; AVX512BW-NEXT:    kshiftlw $1, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $1, %k2, %k2
+; AVX512BW-NEXT:    korw %k1, %k2, %k1
+; AVX512BW-NEXT:    vmovdqa32 576(%rsi), %zmm3 {%k1} {z}
+; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
+; AVX512BW-NEXT:    kshiftrd $21, %k2, %k1
+; AVX512BW-NEXT:    kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k1, %k3
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k5
+; AVX512BW-NEXT:    kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kshiftrw $14, %k5, %k4
+; AVX512BW-NEXT:    korw %k4, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $13, %k5, %k4
+; AVX512BW-NEXT:    korw %k4, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $12, %k5, %k4
+; AVX512BW-NEXT:    korw %k4, %k3, %k3
+; AVX512BW-NEXT:    kandw %k6, %k3, %k3
+; AVX512BW-NEXT:    kshiftrd $22, %k2, %k4
+; AVX512BW-NEXT:    kmovq %k2, %k6
+; AVX512BW-NEXT:    kshiftlw $15, %k4, %k4
+; AVX512BW-NEXT:    kshiftrw $11, %k4, %k5
+; AVX512BW-NEXT:    korw %k5, %k3, %k3
+; AVX512BW-NEXT:    kandw %k7, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $10, %k4, %k5
+; AVX512BW-NEXT:    korw %k5, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $9, %k4, %k5
+; AVX512BW-NEXT:    korw %k5, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $8, %k4, %k5
+; AVX512BW-NEXT:    korw %k5, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $7, %k4, %k5
+; AVX512BW-NEXT:    korw %k5, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $6, %k4, %k4
+; AVX512BW-NEXT:    korw %k4, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k3, %k4
+; AVX512BW-NEXT:    kshiftrd $23, %k6, %k5
+; AVX512BW-NEXT:    kmovq %k6, %k7
+; AVX512BW-NEXT:    kshiftlw $15, %k5, %k3
+; AVX512BW-NEXT:    kshiftrw $5, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k4, %k4
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k4, %k4
+; AVX512BW-NEXT:    kshiftrw $4, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k4, %k4
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k4, %k4
+; AVX512BW-NEXT:    kshiftrw $3, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k4, %k4
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k4, %k4
+; AVX512BW-NEXT:    kshiftrw $2, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k4, %k4
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k4, %k4
+; AVX512BW-NEXT:    kshiftlw $14, %k5, %k5
+; AVX512BW-NEXT:    korw %k5, %k4, %k4
+; AVX512BW-NEXT:    kshiftlw $1, %k4, %k4
+; AVX512BW-NEXT:    kshiftrw $1, %k4, %k4
+; AVX512BW-NEXT:    korw %k3, %k4, %k3
+; AVX512BW-NEXT:    vmovdqa32 512(%rsi), %zmm4 {%k3} {z}
+; AVX512BW-NEXT:    kmovq %k7, %k4
+; AVX512BW-NEXT:    kshiftrd $18, %k7, %k6
+; AVX512BW-NEXT:    kmovd %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k6, %k5
+; AVX512BW-NEXT:    kshiftlw $15, %k6, %k3
+; AVX512BW-NEXT:    kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kshiftrw $14, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kandw %k0, %k5, %k5
+; AVX512BW-NEXT:    kshiftrd $19, %k7, %k6
+; AVX512BW-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $13, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $12, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $11, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $10, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $9, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k5, %k5
+; AVX512BW-NEXT:    kandw %k2, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $8, %k6, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kandw %k1, %k5, %k5
+; AVX512BW-NEXT:    kshiftrd $20, %k4, %k6
+; AVX512BW-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $7, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $6, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $5, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $4, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $2, %k6, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k5, %k5
+; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
+; AVX512BW-NEXT:    kshiftlw $14, %k1, %k2
+; AVX512BW-NEXT:    korw %k2, %k5, %k2
+; AVX512BW-NEXT:    kshiftlw $1, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $1, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    korw %k1, %k2, %k1
+; AVX512BW-NEXT:    vmovdqa32 448(%rsi), %zmm5 {%k1} {z}
+; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload
 ; AVX512BW-NEXT:    kshiftrd $16, %k0, %k1
-; AVX512BW-NEXT:    movq %rax, (%rsi)
-; AVX512BW-NEXT:    movzbl %bl, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    andl $1, %r15d
-; AVX512BW-NEXT:    leaq (%r15,%r15,2), %rax
-; AVX512BW-NEXT:    leaq (%rax,%r12,4), %rax
-; AVX512BW-NEXT:    leaq (%rax,%r12,8), %rax
-; AVX512BW-NEXT:    movq %r12, %r15
-; AVX512BW-NEXT:    shlq $4, %r15
-; AVX512BW-NEXT:    orq %rax, %r15
-; AVX512BW-NEXT:    movq %r12, %rbx
-; AVX512BW-NEXT:    shlq $5, %rbx
-; AVX512BW-NEXT:    orq %r15, %rbx
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $6, %rax
-; AVX512BW-NEXT:    shlq $7, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movzbl %r13b, %r15d
-; AVX512BW-NEXT:    andl $1, %r15d
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $8, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r15, %r12
-; AVX512BW-NEXT:    shlq $9, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $10, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r15, %r13
-; AVX512BW-NEXT:    shlq $11, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r15, %r12
-; AVX512BW-NEXT:    shlq $12, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    kmovd %k1, %r13d
-; AVX512BW-NEXT:    kshiftrd $17, %k0, %k1
-; AVX512BW-NEXT:    shlq $13, %r15
-; AVX512BW-NEXT:    orq %r12, %r15
-; AVX512BW-NEXT:    movzbl %r14b, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %r14
-; AVX512BW-NEXT:    shlq $14, %r14
-; AVX512BW-NEXT:    orq %r15, %r14
-; AVX512BW-NEXT:    movq %rax, %r15
-; AVX512BW-NEXT:    shlq $15, %r15
-; AVX512BW-NEXT:    orq %r14, %r15
-; AVX512BW-NEXT:    movq %rax, %r14
-; AVX512BW-NEXT:    shlq $16, %r14
-; AVX512BW-NEXT:    orq %r15, %r14
-; AVX512BW-NEXT:    movq %rax, %r15
-; AVX512BW-NEXT:    shlq $17, %r15
-; AVX512BW-NEXT:    orq %r14, %r15
-; AVX512BW-NEXT:    movq %rax, %r12
-; AVX512BW-NEXT:    shlq $18, %r12
-; AVX512BW-NEXT:    orq %r15, %r12
-; AVX512BW-NEXT:    kmovd %k1, %r14d
-; AVX512BW-NEXT:    kshiftrd $18, %k0, %k1
-; AVX512BW-NEXT:    shlq $19, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movzbl %bpl, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %r15
-; AVX512BW-NEXT:    shlq $20, %r15
-; AVX512BW-NEXT:    orq %rax, %r15
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $21, %rax
-; AVX512BW-NEXT:    orq %r15, %rax
-; AVX512BW-NEXT:    movq %r12, %r15
-; AVX512BW-NEXT:    shlq $22, %r15
-; AVX512BW-NEXT:    orq %rax, %r15
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $23, %rax
-; AVX512BW-NEXT:    orq %r15, %rax
-; AVX512BW-NEXT:    movq %r12, %rbp
-; AVX512BW-NEXT:    shlq $24, %rbp
-; AVX512BW-NEXT:    orq %rax, %rbp
-; AVX512BW-NEXT:    kmovd %k1, %r15d
-; AVX512BW-NEXT:    kshiftrd $19, %k0, %k1
-; AVX512BW-NEXT:    shlq $25, %r12
-; AVX512BW-NEXT:    orq %rbp, %r12
-; AVX512BW-NEXT:    movzbl %r11b, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %r11
-; AVX512BW-NEXT:    shlq $26, %r11
-; AVX512BW-NEXT:    orq %r12, %r11
-; AVX512BW-NEXT:    movq %rax, %r12
-; AVX512BW-NEXT:    shlq $27, %r12
-; AVX512BW-NEXT:    orq %r11, %r12
-; AVX512BW-NEXT:    movq %rax, %r11
-; AVX512BW-NEXT:    shlq $28, %r11
-; AVX512BW-NEXT:    orq %r12, %r11
-; AVX512BW-NEXT:    movq %rax, %r12
-; AVX512BW-NEXT:    shlq $29, %r12
-; AVX512BW-NEXT:    orq %r11, %r12
-; AVX512BW-NEXT:    movq %rax, %rbp
-; AVX512BW-NEXT:    shlq $30, %rbp
-; AVX512BW-NEXT:    orq %r12, %rbp
-; AVX512BW-NEXT:    kmovd %k1, %r11d
-; AVX512BW-NEXT:    kshiftrd $20, %k0, %k1
-; AVX512BW-NEXT:    shlq $31, %rax
-; AVX512BW-NEXT:    orq %rbp, %rax
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $32, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $33, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $34, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $35, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %rbp
-; AVX512BW-NEXT:    shlq $36, %rbp
-; AVX512BW-NEXT:    orq %rax, %rbp
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrd $27, %k0, %k1
-; AVX512BW-NEXT:    shlq $37, %r13
-; AVX512BW-NEXT:    orq %rbp, %r13
-; AVX512BW-NEXT:    andl $1, %r14d
-; AVX512BW-NEXT:    movq %r14, %rax
-; AVX512BW-NEXT:    shlq $38, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r14, %r13
-; AVX512BW-NEXT:    shlq $39, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r14, %rax
-; AVX512BW-NEXT:    shlq $40, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r14, %rbp
-; AVX512BW-NEXT:    shlq $41, %rbp
-; AVX512BW-NEXT:    orq %rax, %rbp
-; AVX512BW-NEXT:    movq %r14, %r13
-; AVX512BW-NEXT:    shlq $42, %r13
-; AVX512BW-NEXT:    orq %rbp, %r13
-; AVX512BW-NEXT:    kmovd %k1, %ebp
-; AVX512BW-NEXT:    kshiftrd $28, %k0, %k1
-; AVX512BW-NEXT:    shlq $43, %r14
-; AVX512BW-NEXT:    orq %r13, %r14
-; AVX512BW-NEXT:    andl $1, %r15d
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $44, %rax
-; AVX512BW-NEXT:    orq %r14, %rax
-; AVX512BW-NEXT:    movq %r15, %r14
-; AVX512BW-NEXT:    shlq $45, %r14
-; AVX512BW-NEXT:    orq %rax, %r14
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $46, %rax
-; AVX512BW-NEXT:    orq %r14, %rax
-; AVX512BW-NEXT:    movq %r15, %r14
-; AVX512BW-NEXT:    shlq $47, %r14
-; AVX512BW-NEXT:    orq %rax, %r14
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $48, %rax
-; AVX512BW-NEXT:    orq %r14, %rax
-; AVX512BW-NEXT:    kmovd %k1, %r14d
-; AVX512BW-NEXT:    kshiftrd $29, %k0, %k1
-; AVX512BW-NEXT:    shlq $49, %r15
-; AVX512BW-NEXT:    orq %rax, %r15
-; AVX512BW-NEXT:    andl $1, %r11d
-; AVX512BW-NEXT:    movq %r11, %rax
-; AVX512BW-NEXT:    shlq $50, %rax
-; AVX512BW-NEXT:    orq %r15, %rax
-; AVX512BW-NEXT:    movq %r11, %r15
-; AVX512BW-NEXT:    shlq $51, %r15
-; AVX512BW-NEXT:    orq %rax, %r15
-; AVX512BW-NEXT:    movq %r11, %rax
-; AVX512BW-NEXT:    shlq $52, %rax
-; AVX512BW-NEXT:    orq %r15, %rax
-; AVX512BW-NEXT:    movq %r11, %r15
-; AVX512BW-NEXT:    shlq $53, %r15
-; AVX512BW-NEXT:    orq %rax, %r15
-; AVX512BW-NEXT:    movq %r11, %rax
-; AVX512BW-NEXT:    shlq $54, %rax
-; AVX512BW-NEXT:    orq %r15, %rax
-; AVX512BW-NEXT:    kmovd %k1, %r15d
-; AVX512BW-NEXT:    kshiftrd $30, %k0, %k0
-; AVX512BW-NEXT:    shlq $55, %r11
-; AVX512BW-NEXT:    orq %rax, %r11
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $56, %rax
-; AVX512BW-NEXT:    orq %r11, %rax
-; AVX512BW-NEXT:    movq %r12, %r11
-; AVX512BW-NEXT:    shlq $57, %r11
-; AVX512BW-NEXT:    orq %rax, %r11
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $58, %rax
-; AVX512BW-NEXT:    orq %r11, %rax
-; AVX512BW-NEXT:    movq %r12, %r11
-; AVX512BW-NEXT:    shlq $59, %r11
-; AVX512BW-NEXT:    orq %rax, %r11
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $60, %rax
-; AVX512BW-NEXT:    orq %r11, %rax
-; AVX512BW-NEXT:    kmovd %k0, %r11d
-; AVX512BW-NEXT:    shlq $61, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movzbl %r10b, %eax
-; AVX512BW-NEXT:    # kill: def $r10d killed $r10d def $r10
-; AVX512BW-NEXT:    andl $1, %r10d
-; AVX512BW-NEXT:    shlq $62, %r10
-; AVX512BW-NEXT:    orq %r12, %r10
-; AVX512BW-NEXT:    movq %rax, %r12
-; AVX512BW-NEXT:    shlq $63, %r12
-; AVX512BW-NEXT:    orq %r10, %r12
-; AVX512BW-NEXT:    orq %rbx, %r12
-; AVX512BW-NEXT:    movq %r12, 8(%rsi)
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    leaq (%rax,%rax,2), %r10
-; AVX512BW-NEXT:    leaq (%r10,%rax,4), %r10
-; AVX512BW-NEXT:    leaq (%r10,%rax,8), %rax
-; AVX512BW-NEXT:    movzbl %r9b, %r10d
-; AVX512BW-NEXT:    andl $1, %r10d
-; AVX512BW-NEXT:    movq %r10, %rbx
-; AVX512BW-NEXT:    shlq $4, %rbx
-; AVX512BW-NEXT:    orq %rax, %rbx
-; AVX512BW-NEXT:    movq %r10, %r9
-; AVX512BW-NEXT:    shlq $5, %r9
-; AVX512BW-NEXT:    orq %rbx, %r9
-; AVX512BW-NEXT:    movq %r10, %rax
-; AVX512BW-NEXT:    shlq $6, %rax
-; AVX512BW-NEXT:    movq %r10, %rbx
-; AVX512BW-NEXT:    shlq $7, %rbx
-; AVX512BW-NEXT:    orq %rax, %rbx
-; AVX512BW-NEXT:    movq %r10, %rax
-; AVX512BW-NEXT:    shlq $8, %rax
-; AVX512BW-NEXT:    orq %rbx, %rax
-; AVX512BW-NEXT:    shlq $9, %r10
-; AVX512BW-NEXT:    orq %rax, %r10
-; AVX512BW-NEXT:    movzbl %r8b, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %r8
-; AVX512BW-NEXT:    shlq $10, %r8
-; AVX512BW-NEXT:    orq %r10, %r8
-; AVX512BW-NEXT:    movq %rax, %r10
-; AVX512BW-NEXT:    shlq $11, %r10
-; AVX512BW-NEXT:    orq %r8, %r10
-; AVX512BW-NEXT:    movq %rax, %r8
-; AVX512BW-NEXT:    shlq $12, %r8
-; AVX512BW-NEXT:    orq %r10, %r8
-; AVX512BW-NEXT:    movq %rax, %r10
-; AVX512BW-NEXT:    shlq $13, %r10
-; AVX512BW-NEXT:    orq %r8, %r10
-; AVX512BW-NEXT:    movq %rax, %r8
-; AVX512BW-NEXT:    shlq $14, %r8
-; AVX512BW-NEXT:    orq %r10, %r8
-; AVX512BW-NEXT:    shlq $15, %rax
-; AVX512BW-NEXT:    orq %r8, %rax
-; AVX512BW-NEXT:    movzbl %dil, %edi
-; AVX512BW-NEXT:    andl $1, %edi
-; AVX512BW-NEXT:    movq %rdi, %r8
-; AVX512BW-NEXT:    shlq $16, %r8
-; AVX512BW-NEXT:    orq %rax, %r8
-; AVX512BW-NEXT:    movq %rdi, %rax
-; AVX512BW-NEXT:    shlq $17, %rax
-; AVX512BW-NEXT:    orq %r8, %rax
-; AVX512BW-NEXT:    movq %rdi, %r8
-; AVX512BW-NEXT:    shlq $18, %r8
-; AVX512BW-NEXT:    orq %rax, %r8
-; AVX512BW-NEXT:    movq %rdi, %rax
-; AVX512BW-NEXT:    shlq $19, %rax
-; AVX512BW-NEXT:    orq %r8, %rax
-; AVX512BW-NEXT:    movq %rdi, %r8
-; AVX512BW-NEXT:    shlq $20, %r8
-; AVX512BW-NEXT:    orq %rax, %r8
-; AVX512BW-NEXT:    shlq $21, %rdi
-; AVX512BW-NEXT:    orq %r8, %rdi
-; AVX512BW-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %r8
-; AVX512BW-NEXT:    shlq $22, %r8
-; AVX512BW-NEXT:    orq %rdi, %r8
-; AVX512BW-NEXT:    movq %rax, %rdi
-; AVX512BW-NEXT:    shlq $23, %rdi
-; AVX512BW-NEXT:    orq %r8, %rdi
-; AVX512BW-NEXT:    movq %rax, %r8
-; AVX512BW-NEXT:    shlq $24, %r8
-; AVX512BW-NEXT:    orq %rdi, %r8
-; AVX512BW-NEXT:    movq %rax, %rdi
-; AVX512BW-NEXT:    shlq $25, %rdi
-; AVX512BW-NEXT:    orq %r8, %rdi
-; AVX512BW-NEXT:    movq %rax, %r8
-; AVX512BW-NEXT:    shlq $26, %r8
-; AVX512BW-NEXT:    orq %rdi, %r8
-; AVX512BW-NEXT:    shlq $27, %rax
-; AVX512BW-NEXT:    orq %r8, %rax
-; AVX512BW-NEXT:    movzbl %cl, %edi
-; AVX512BW-NEXT:    andl $1, %edi
-; AVX512BW-NEXT:    movq %rdi, %r8
-; AVX512BW-NEXT:    shlq $28, %r8
-; AVX512BW-NEXT:    orq %rax, %r8
-; AVX512BW-NEXT:    movq %rdi, %rax
-; AVX512BW-NEXT:    shlq $29, %rax
-; AVX512BW-NEXT:    orq %r8, %rax
-; AVX512BW-NEXT:    movq %rdi, %r8
-; AVX512BW-NEXT:    shlq $30, %r8
-; AVX512BW-NEXT:    orq %rax, %r8
-; AVX512BW-NEXT:    shlq $31, %rdi
-; AVX512BW-NEXT:    orq %r8, %rdi
-; AVX512BW-NEXT:    andl $1, %ecx
-; AVX512BW-NEXT:    movq %rcx, %rax
-; AVX512BW-NEXT:    shlq $32, %rax
-; AVX512BW-NEXT:    orq %rdi, %rax
-; AVX512BW-NEXT:    shlq $33, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    andl $1, %ebp
-; AVX512BW-NEXT:    movq %rbp, %rax
-; AVX512BW-NEXT:    shlq $34, %rax
-; AVX512BW-NEXT:    orq %rcx, %rax
-; AVX512BW-NEXT:    movq %rbp, %rcx
-; AVX512BW-NEXT:    shlq $35, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    movq %rbp, %rax
-; AVX512BW-NEXT:    shlq $36, %rax
-; AVX512BW-NEXT:    orq %rcx, %rax
-; AVX512BW-NEXT:    movq %rbp, %rcx
-; AVX512BW-NEXT:    shlq $37, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    movq %rbp, %rax
-; AVX512BW-NEXT:    shlq $38, %rax
-; AVX512BW-NEXT:    orq %rcx, %rax
-; AVX512BW-NEXT:    shlq $39, %rbp
-; AVX512BW-NEXT:    orq %rax, %rbp
-; AVX512BW-NEXT:    andl $1, %r14d
-; AVX512BW-NEXT:    movq %r14, %rax
-; AVX512BW-NEXT:    shlq $40, %rax
-; AVX512BW-NEXT:    orq %rbp, %rax
-; AVX512BW-NEXT:    movq %r14, %rcx
-; AVX512BW-NEXT:    shlq $41, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    movq %r14, %rax
-; AVX512BW-NEXT:    shlq $42, %rax
-; AVX512BW-NEXT:    orq %rcx, %rax
-; AVX512BW-NEXT:    movq %r14, %rcx
-; AVX512BW-NEXT:    shlq $43, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    movq %r14, %rax
-; AVX512BW-NEXT:    shlq $44, %rax
-; AVX512BW-NEXT:    orq %rcx, %rax
-; AVX512BW-NEXT:    shlq $45, %r14
-; AVX512BW-NEXT:    orq %rax, %r14
-; AVX512BW-NEXT:    andl $1, %r15d
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $46, %rax
-; AVX512BW-NEXT:    orq %r14, %rax
-; AVX512BW-NEXT:    movq %r15, %rcx
-; AVX512BW-NEXT:    shlq $47, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $48, %rax
-; AVX512BW-NEXT:    orq %rcx, %rax
-; AVX512BW-NEXT:    movq %r15, %rcx
-; AVX512BW-NEXT:    shlq $49, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $50, %rax
-; AVX512BW-NEXT:    orq %rcx, %rax
-; AVX512BW-NEXT:    shlq $51, %r15
-; AVX512BW-NEXT:    orq %rax, %r15
-; AVX512BW-NEXT:    andl $1, %r11d
-; AVX512BW-NEXT:    movq %r11, %rax
-; AVX512BW-NEXT:    shlq $52, %rax
-; AVX512BW-NEXT:    orq %r15, %rax
-; AVX512BW-NEXT:    movq %r11, %rcx
-; AVX512BW-NEXT:    shlq $53, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    movq %r11, %rax
-; AVX512BW-NEXT:    shlq $54, %rax
-; AVX512BW-NEXT:    orq %rcx, %rax
-; AVX512BW-NEXT:    movq %r11, %rcx
-; AVX512BW-NEXT:    shlq $55, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    movq %r11, %rax
-; AVX512BW-NEXT:    shlq $56, %rax
-; AVX512BW-NEXT:    orq %rcx, %rax
-; AVX512BW-NEXT:    shlq $57, %r11
-; AVX512BW-NEXT:    orq %rax, %r11
-; AVX512BW-NEXT:    movl %edx, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %rcx
-; AVX512BW-NEXT:    shlq $58, %rcx
-; AVX512BW-NEXT:    orq %r11, %rcx
-; AVX512BW-NEXT:    movq %rax, %rdi
-; AVX512BW-NEXT:    shlq $59, %rdi
-; AVX512BW-NEXT:    orq %rcx, %rdi
-; AVX512BW-NEXT:    movq %rax, %rcx
-; AVX512BW-NEXT:    shlq $60, %rcx
-; AVX512BW-NEXT:    orq %rdi, %rcx
-; AVX512BW-NEXT:    movq %rax, %rdi
-; AVX512BW-NEXT:    shlq $61, %rdi
-; AVX512BW-NEXT:    orq %rcx, %rdi
-; AVX512BW-NEXT:    shlq $62, %rax
-; AVX512BW-NEXT:    orq %rdi, %rax
-; AVX512BW-NEXT:    movzbl %dl, %ecx
-; AVX512BW-NEXT:    shlq $63, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    orq %r9, %rcx
-; AVX512BW-NEXT:    movq %rcx, 16(%rsi)
-; AVX512BW-NEXT:    popq %rbx
-; AVX512BW-NEXT:    popq %r12
-; AVX512BW-NEXT:    popq %r13
-; AVX512BW-NEXT:    popq %r14
-; AVX512BW-NEXT:    popq %r15
-; AVX512BW-NEXT:    popq %rbp
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k2
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $14, %k1, %k5
+; AVX512BW-NEXT:    korw %k5, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $13, %k1, %k5
+; AVX512BW-NEXT:    korw %k5, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $12, %k1, %k5
+; AVX512BW-NEXT:    korw %k5, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $11, %k1, %k5
+; AVX512BW-NEXT:    korw %k5, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $10, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k2, %k1
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrd $17, %k0, %k2
+; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $9, %k2, %k5
+; AVX512BW-NEXT:    korw %k5, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $8, %k2, %k5
+; AVX512BW-NEXT:    korw %k5, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $7, %k2, %k5
+; AVX512BW-NEXT:    korw %k5, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $6, %k2, %k5
+; AVX512BW-NEXT:    korw %k5, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $5, %k2, %k5
+; AVX512BW-NEXT:    korw %k5, %k1, %k1
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $4, %k2, %k2
+; AVX512BW-NEXT:    korw %k2, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kshiftrw $3, %k3, %k2
+; AVX512BW-NEXT:    korw %k2, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $2, %k3, %k2
+; AVX512BW-NEXT:    korw %k2, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k1, %k1
+; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload
+; AVX512BW-NEXT:    kshiftlw $14, %k0, %k2
+; AVX512BW-NEXT:    korw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
+; AVX512BW-NEXT:    korw %k3, %k1, %k1
+; AVX512BW-NEXT:    vmovdqa32 384(%rsi), %zmm6 {%k1} {z}
+; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload
+; AVX512BW-NEXT:    kshiftrd $13, %k0, %k2
+; AVX512BW-NEXT:    kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k2, %k3
+; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
+; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kshiftrw $14, %k2, %k4
+; AVX512BW-NEXT:    korw %k4, %k3, %k3
+; AVX512BW-NEXT:    kandw %k6, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $13, %k2, %k4
+; AVX512BW-NEXT:    korw %k4, %k3, %k3
+; AVX512BW-NEXT:    kandw %k7, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $12, %k2, %k4
+; AVX512BW-NEXT:    korw %k4, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k3, %k3
+; AVX512BW-NEXT:    kshiftrd $14, %k0, %k4
+; AVX512BW-NEXT:    kmovq %k0, %k7
+; AVX512BW-NEXT:    kshiftlw $15, %k4, %k4
+; AVX512BW-NEXT:    kshiftrw $11, %k4, %k5
+; AVX512BW-NEXT:    korw %k5, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $10, %k4, %k5
+; AVX512BW-NEXT:    korw %k5, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $9, %k4, %k5
+; AVX512BW-NEXT:    korw %k5, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $8, %k4, %k5
+; AVX512BW-NEXT:    korw %k5, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $7, %k4, %k5
+; AVX512BW-NEXT:    korw %k5, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $6, %k4, %k4
+; AVX512BW-NEXT:    korw %k4, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k3, %k4
+; AVX512BW-NEXT:    kshiftrd $15, %k7, %k5
+; AVX512BW-NEXT:    kshiftlw $15, %k5, %k3
+; AVX512BW-NEXT:    kshiftrw $5, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k4, %k4
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k4, %k4
+; AVX512BW-NEXT:    kshiftrw $4, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k4, %k4
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k4, %k4
+; AVX512BW-NEXT:    kshiftrw $3, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k4, %k4
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k4, %k4
+; AVX512BW-NEXT:    kshiftrw $2, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k4, %k4
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k4, %k4
+; AVX512BW-NEXT:    kshiftlw $14, %k5, %k5
+; AVX512BW-NEXT:    korw %k5, %k4, %k4
+; AVX512BW-NEXT:    kshiftlw $1, %k4, %k4
+; AVX512BW-NEXT:    kshiftrw $1, %k4, %k4
+; AVX512BW-NEXT:    korw %k3, %k4, %k3
+; AVX512BW-NEXT:    vmovdqa32 320(%rsi), %zmm7 {%k3} {z}
+; AVX512BW-NEXT:    kmovq %k7, %k3
+; AVX512BW-NEXT:    kshiftrd $10, %k7, %k0
+; AVX512BW-NEXT:    kmovd %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512BW-NEXT:    kandw %k1, %k0, %k5
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kshiftrw $14, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k5, %k5
+; AVX512BW-NEXT:    kshiftrd $11, %k7, %k6
+; AVX512BW-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $13, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $12, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $11, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $10, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $9, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $8, %k6, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k5, %k5
+; AVX512BW-NEXT:    kshiftrd $12, %k3, %k6
+; AVX512BW-NEXT:    kshiftlw $15, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $7, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $6, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $5, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $4, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $2, %k6, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kandw %k2, %k5, %k5
+; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
+; AVX512BW-NEXT:    kshiftlw $14, %k1, %k2
+; AVX512BW-NEXT:    korw %k2, %k5, %k2
+; AVX512BW-NEXT:    kshiftlw $1, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $1, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    korw %k1, %k2, %k1
+; AVX512BW-NEXT:    vmovdqa32 256(%rsi), %zmm8 {%k1} {z}
+; AVX512BW-NEXT:    kshiftrd $8, %k3, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k2
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $14, %k1, %k5
+; AVX512BW-NEXT:    korw %k5, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $13, %k1, %k5
+; AVX512BW-NEXT:    korw %k5, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $12, %k1, %k5
+; AVX512BW-NEXT:    korw %k5, %k2, %k2
+; AVX512BW-NEXT:    kandw %k0, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $11, %k1, %k5
+; AVX512BW-NEXT:    korw %k5, %k2, %k2
+; AVX512BW-NEXT:    kandw %k4, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $10, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k2, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k1, %k1
+; AVX512BW-NEXT:    kshiftrd $9, %k3, %k2
+; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $9, %k2, %k5
+; AVX512BW-NEXT:    korw %k5, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $8, %k2, %k5
+; AVX512BW-NEXT:    korw %k5, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $7, %k2, %k5
+; AVX512BW-NEXT:    korw %k5, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $6, %k2, %k5
+; AVX512BW-NEXT:    korw %k5, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $5, %k2, %k5
+; AVX512BW-NEXT:    korw %k5, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $4, %k2, %k2
+; AVX512BW-NEXT:    korw %k2, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kshiftrw $3, %k3, %k2
+; AVX512BW-NEXT:    korw %k2, %k1, %k1
+; AVX512BW-NEXT:    kandw %k7, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $2, %k3, %k2
+; AVX512BW-NEXT:    korw %k2, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
+; AVX512BW-NEXT:    kshiftlw $14, %k2, %k2
+; AVX512BW-NEXT:    korw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
+; AVX512BW-NEXT:    korw %k3, %k1, %k1
+; AVX512BW-NEXT:    vmovdqa32 192(%rsi), %zmm9 {%k1} {z}
+; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
+; AVX512BW-NEXT:    kshiftrd $5, %k1, %k2
+; AVX512BW-NEXT:    kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512BW-NEXT:    kandw %k6, %k2, %k3
+; AVX512BW-NEXT:    kshiftlw $15, %k2, %k7
+; AVX512BW-NEXT:    kshiftrw $14, %k7, %k4
+; AVX512BW-NEXT:    korw %k4, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $13, %k7, %k4
+; AVX512BW-NEXT:    korw %k4, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $12, %k7, %k4
+; AVX512BW-NEXT:    korw %k4, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k3, %k3
+; AVX512BW-NEXT:    kshiftrd $6, %k1, %k4
+; AVX512BW-NEXT:    kshiftlw $15, %k4, %k4
+; AVX512BW-NEXT:    kshiftrw $11, %k4, %k5
+; AVX512BW-NEXT:    korw %k5, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $10, %k4, %k5
+; AVX512BW-NEXT:    korw %k5, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $9, %k4, %k5
+; AVX512BW-NEXT:    korw %k5, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $8, %k4, %k5
+; AVX512BW-NEXT:    korw %k5, %k3, %k3
+; AVX512BW-NEXT:    kandw %k0, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $7, %k4, %k5
+; AVX512BW-NEXT:    korw %k5, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $6, %k4, %k4
+; AVX512BW-NEXT:    korw %k4, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k3, %k4
+; AVX512BW-NEXT:    kshiftrd $7, %k1, %k5
+; AVX512BW-NEXT:    kshiftlw $15, %k5, %k3
+; AVX512BW-NEXT:    kshiftrw $5, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k4, %k4
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k4, %k4
+; AVX512BW-NEXT:    kshiftrw $4, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k4, %k4
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k4, %k4
+; AVX512BW-NEXT:    kshiftrw $3, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k4, %k4
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k4, %k4
+; AVX512BW-NEXT:    kshiftrw $2, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k4, %k4
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k4, %k4
+; AVX512BW-NEXT:    kshiftlw $14, %k5, %k5
+; AVX512BW-NEXT:    korw %k5, %k4, %k4
+; AVX512BW-NEXT:    kshiftlw $1, %k4, %k4
+; AVX512BW-NEXT:    kshiftrw $1, %k4, %k4
+; AVX512BW-NEXT:    korw %k3, %k4, %k3
+; AVX512BW-NEXT:    vmovdqa32 128(%rsi), %zmm10 {%k3} {z}
+; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kshiftrw $14, %k4, %k4
+; AVX512BW-NEXT:    korw %k4, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k3, %k3
+; AVX512BW-NEXT:    kshiftrd $3, %k1, %k4
+; AVX512BW-NEXT:    kshiftlw $15, %k4, %k4
+; AVX512BW-NEXT:    kshiftrw $13, %k4, %k5
+; AVX512BW-NEXT:    korw %k5, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $12, %k4, %k5
+; AVX512BW-NEXT:    korw %k5, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $11, %k4, %k5
+; AVX512BW-NEXT:    korw %k5, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $10, %k4, %k5
+; AVX512BW-NEXT:    korw %k5, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $9, %k4, %k5
+; AVX512BW-NEXT:    korw %k5, %k3, %k3
+; AVX512BW-NEXT:    kandw %k2, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $8, %k4, %k4
+; AVX512BW-NEXT:    korw %k4, %k3, %k3
+; AVX512BW-NEXT:    kandw %k0, %k3, %k3
+; AVX512BW-NEXT:    kshiftrd $4, %k1, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $7, %k0, %k4
+; AVX512BW-NEXT:    korw %k4, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $6, %k0, %k4
+; AVX512BW-NEXT:    korw %k4, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $5, %k0, %k4
+; AVX512BW-NEXT:    korw %k4, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $4, %k0, %k4
+; AVX512BW-NEXT:    korw %k4, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $3, %k0, %k4
+; AVX512BW-NEXT:    korw %k4, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $2, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k3, %k0
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
+; AVX512BW-NEXT:    kshiftlw $14, %k1, %k2
+; AVX512BW-NEXT:    korw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512BW-NEXT:    korw %k7, %k0, %k1
+; AVX512BW-NEXT:    vmovdqa32 64(%rsi), %zmm11 {%k1} {z}
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, 64(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm10, 128(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, 192(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, 256(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, 320(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, 384(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, 448(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, 512(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, 576(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, 640(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, 704(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdx)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
-  %src.vec = load <32 x i1>, ptr %in.vec, align 64
-  %tgt.mask = shufflevector <32 x i1> %src.vec, <32 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-  store <192 x i1> %tgt.mask, ptr %out.vec, align 64
+  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
+  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %tgt.mask = shufflevector <32 x i1> %src.mask, <32 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+  %data = call <192 x i32> @llvm.masked.load.v192i32.p0(ptr %in.vec, i32 64, <192 x i1> %tgt.mask, <192 x i32> poison)
+  store <192 x i32> %data, ptr %out.vec, align 64
   ret void
 }
 
-define void @mask_replication_factor6_vf64(ptr %in.vec, ptr %out.vec) nounwind {
+define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512F-ONLY-LABEL: mask_replication_factor6_vf64:
 ; AVX512F-ONLY:       # %bb.0:
-; AVX512F-ONLY-NEXT:    pushq %rbp
-; AVX512F-ONLY-NEXT:    pushq %r15
-; AVX512F-ONLY-NEXT:    pushq %r14
-; AVX512F-ONLY-NEXT:    pushq %r13
-; AVX512F-ONLY-NEXT:    pushq %r12
-; AVX512F-ONLY-NEXT:    pushq %rbx
-; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k3
-; AVX512F-ONLY-NEXT:    kmovw 2(%rdi), %k0
-; AVX512F-ONLY-NEXT:    kmovw 4(%rdi), %k2
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm2
+; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; AVX512F-ONLY-NEXT:    movw $1, %ax
+; AVX512F-ONLY-NEXT:    kmovw %eax, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1}
 ; AVX512F-ONLY-NEXT:    kmovw 6(%rdi), %k1
-; AVX512F-ONLY-NEXT:    kshiftrw $3, %k0, %k4
-; AVX512F-ONLY-NEXT:    kmovw %k4, %eax
-; AVX512F-ONLY-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512F-ONLY-NEXT:    kshiftrw $15, %k3, %k4
-; AVX512F-ONLY-NEXT:    kmovw %k4, %ecx
-; AVX512F-ONLY-NEXT:    kshiftrw $14, %k3, %k4
-; AVX512F-ONLY-NEXT:    kmovw %k4, %edx
-; AVX512F-ONLY-NEXT:    kshiftrw $13, %k3, %k4
-; AVX512F-ONLY-NEXT:    kmovw %k4, %edi
-; AVX512F-ONLY-NEXT:    kshiftrw $12, %k3, %k4
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r8d
-; AVX512F-ONLY-NEXT:    kshiftrw $11, %k3, %k4
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r9d
-; AVX512F-ONLY-NEXT:    kshiftrw $10, %k0, %k4
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r10d
-; AVX512F-ONLY-NEXT:    kshiftrw $6, %k0, %k4
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r11d
-; AVX512F-ONLY-NEXT:    kshiftrw $5, %k3, %k4
-; AVX512F-ONLY-NEXT:    kmovw %k4, %ebx
-; AVX512F-ONLY-NEXT:    kshiftrw $10, %k2, %k4
-; AVX512F-ONLY-NEXT:    kmovw %k4, %ebp
-; AVX512F-ONLY-NEXT:    kshiftrw $1, %k2, %k4
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r15d
-; AVX512F-ONLY-NEXT:    kshiftrw $2, %k2, %k4
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r14d
-; AVX512F-ONLY-NEXT:    movzbl %r14b, %r14d
-; AVX512F-ONLY-NEXT:    andl $1, %r14d
-; AVX512F-ONLY-NEXT:    leaq (%r14,%r14,2), %r12
-; AVX512F-ONLY-NEXT:    leaq (%r12,%r14,4), %r12
-; AVX512F-ONLY-NEXT:    leaq (%r12,%r14,8), %r12
-; AVX512F-ONLY-NEXT:    movq %r14, %r13
-; AVX512F-ONLY-NEXT:    shlq $4, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $5, %r14
-; AVX512F-ONLY-NEXT:    orq %r13, %r14
-; AVX512F-ONLY-NEXT:    movzbl %r15b, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %r15
-; AVX512F-ONLY-NEXT:    shlq $6, %r15
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $7, %r13
-; AVX512F-ONLY-NEXT:    orq %r15, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %r15
-; AVX512F-ONLY-NEXT:    shlq $8, %r15
-; AVX512F-ONLY-NEXT:    orq %r13, %r15
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $9, %r13
-; AVX512F-ONLY-NEXT:    orq %r15, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %r15
-; AVX512F-ONLY-NEXT:    shlq $10, %r15
-; AVX512F-ONLY-NEXT:    orq %r13, %r15
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r13d
-; AVX512F-ONLY-NEXT:    kshiftrw $3, %k2, %k4
-; AVX512F-ONLY-NEXT:    shlq $11, %r12
-; AVX512F-ONLY-NEXT:    orq %r15, %r12
-; AVX512F-ONLY-NEXT:    movzbl %r13b, %r15d
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    movq %r15, %r13
-; AVX512F-ONLY-NEXT:    shlq $12, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $13, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    movq %r15, %r13
-; AVX512F-ONLY-NEXT:    shlq $14, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $15, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    movq %r15, %r13
-; AVX512F-ONLY-NEXT:    shlq $16, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $4, %k2, %k4
-; AVX512F-ONLY-NEXT:    shlq $17, %r15
-; AVX512F-ONLY-NEXT:    orq %r13, %r15
-; AVX512F-ONLY-NEXT:    movzbl %r12b, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $18, %r13
-; AVX512F-ONLY-NEXT:    orq %r15, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %r15
-; AVX512F-ONLY-NEXT:    shlq $19, %r15
-; AVX512F-ONLY-NEXT:    orq %r13, %r15
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $20, %r13
-; AVX512F-ONLY-NEXT:    orq %r15, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %r15
-; AVX512F-ONLY-NEXT:    shlq $21, %r15
-; AVX512F-ONLY-NEXT:    orq %r13, %r15
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $22, %r13
-; AVX512F-ONLY-NEXT:    orq %r15, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r15d
-; AVX512F-ONLY-NEXT:    kshiftrw $5, %k2, %k4
-; AVX512F-ONLY-NEXT:    shlq $23, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    movzbl %r15b, %r15d
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    movq %r15, %r13
-; AVX512F-ONLY-NEXT:    shlq $24, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $25, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    movq %r15, %r13
-; AVX512F-ONLY-NEXT:    shlq $26, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $27, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    movq %r15, %r13
-; AVX512F-ONLY-NEXT:    shlq $28, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $6, %k2, %k4
-; AVX512F-ONLY-NEXT:    shlq $29, %r15
-; AVX512F-ONLY-NEXT:    orq %r13, %r15
-; AVX512F-ONLY-NEXT:    movzbl %r12b, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $30, %r13
-; AVX512F-ONLY-NEXT:    orq %r15, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %r15
-; AVX512F-ONLY-NEXT:    shlq $31, %r15
-; AVX512F-ONLY-NEXT:    orq %r13, %r15
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $32, %r13
-; AVX512F-ONLY-NEXT:    orq %r15, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %r15
-; AVX512F-ONLY-NEXT:    shlq $33, %r15
-; AVX512F-ONLY-NEXT:    orq %r13, %r15
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $34, %r13
-; AVX512F-ONLY-NEXT:    orq %r15, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r15d
-; AVX512F-ONLY-NEXT:    kshiftrw $7, %k2, %k4
-; AVX512F-ONLY-NEXT:    shlq $35, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    movzbl %r15b, %r15d
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    movq %r15, %r13
-; AVX512F-ONLY-NEXT:    shlq $36, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $37, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    movq %r15, %r13
-; AVX512F-ONLY-NEXT:    shlq $38, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $39, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    movq %r15, %r13
-; AVX512F-ONLY-NEXT:    shlq $40, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $8, %k2, %k4
-; AVX512F-ONLY-NEXT:    shlq $41, %r15
-; AVX512F-ONLY-NEXT:    orq %r13, %r15
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $42, %r13
-; AVX512F-ONLY-NEXT:    orq %r15, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %r15
-; AVX512F-ONLY-NEXT:    shlq $43, %r15
-; AVX512F-ONLY-NEXT:    orq %r13, %r15
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $44, %r13
-; AVX512F-ONLY-NEXT:    orq %r15, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %r15
-; AVX512F-ONLY-NEXT:    shlq $45, %r15
-; AVX512F-ONLY-NEXT:    orq %r13, %r15
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $46, %r13
-; AVX512F-ONLY-NEXT:    orq %r15, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r15d
-; AVX512F-ONLY-NEXT:    kshiftrw $9, %k2, %k4
-; AVX512F-ONLY-NEXT:    shlq $47, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    movq %r15, %r13
-; AVX512F-ONLY-NEXT:    shlq $48, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $49, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    movq %r15, %r13
-; AVX512F-ONLY-NEXT:    shlq $50, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $51, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    movq %r15, %r13
-; AVX512F-ONLY-NEXT:    shlq $52, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $1, %k3, %k4
-; AVX512F-ONLY-NEXT:    shlq $53, %r15
-; AVX512F-ONLY-NEXT:    orq %r13, %r15
-; AVX512F-ONLY-NEXT:    movzbl %r12b, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $54, %r13
-; AVX512F-ONLY-NEXT:    orq %r15, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %r15
-; AVX512F-ONLY-NEXT:    shlq $55, %r15
-; AVX512F-ONLY-NEXT:    orq %r13, %r15
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $56, %r13
-; AVX512F-ONLY-NEXT:    orq %r15, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %r15
-; AVX512F-ONLY-NEXT:    shlq $57, %r15
-; AVX512F-ONLY-NEXT:    orq %r13, %r15
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $58, %r13
-; AVX512F-ONLY-NEXT:    orq %r15, %r13
-; AVX512F-ONLY-NEXT:    shlq $59, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    movzbl %bpl, %r15d
-; AVX512F-ONLY-NEXT:    movl %ebp, %r13d
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %rbp
-; AVX512F-ONLY-NEXT:    shlq $60, %rbp
-; AVX512F-ONLY-NEXT:    orq %r12, %rbp
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $61, %r12
-; AVX512F-ONLY-NEXT:    orq %rbp, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k4, %ebp
-; AVX512F-ONLY-NEXT:    kshiftrw $2, %k3, %k4
-; AVX512F-ONLY-NEXT:    shlq $62, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $63, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    orq %r14, %r12
-; AVX512F-ONLY-NEXT:    movq %r12, 24(%rsi)
-; AVX512F-ONLY-NEXT:    kmovw %k3, %r14d
-; AVX512F-ONLY-NEXT:    movzbl %r14b, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    leaq (%r12,%r12,2), %r14
-; AVX512F-ONLY-NEXT:    leaq (%r14,%r12,4), %r14
-; AVX512F-ONLY-NEXT:    leaq (%r14,%r12,8), %r14
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $4, %r13
-; AVX512F-ONLY-NEXT:    orq %r14, %r13
-; AVX512F-ONLY-NEXT:    shlq $5, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    movzbl %bpl, %r13d
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %r14
-; AVX512F-ONLY-NEXT:    shlq $6, %r14
-; AVX512F-ONLY-NEXT:    movq %r13, %rbp
-; AVX512F-ONLY-NEXT:    shlq $7, %rbp
-; AVX512F-ONLY-NEXT:    orq %r14, %rbp
-; AVX512F-ONLY-NEXT:    movq %r13, %r14
-; AVX512F-ONLY-NEXT:    shlq $8, %r14
-; AVX512F-ONLY-NEXT:    orq %rbp, %r14
-; AVX512F-ONLY-NEXT:    movq %r13, %rbp
-; AVX512F-ONLY-NEXT:    shlq $9, %rbp
-; AVX512F-ONLY-NEXT:    orq %r14, %rbp
-; AVX512F-ONLY-NEXT:    movq %r13, %r14
-; AVX512F-ONLY-NEXT:    shlq $10, %r14
-; AVX512F-ONLY-NEXT:    orq %rbp, %r14
-; AVX512F-ONLY-NEXT:    kmovw %k4, %ebp
-; AVX512F-ONLY-NEXT:    kshiftrw $3, %k3, %k4
-; AVX512F-ONLY-NEXT:    shlq $11, %r13
-; AVX512F-ONLY-NEXT:    orq %r14, %r13
-; AVX512F-ONLY-NEXT:    movzbl %bpl, %r14d
-; AVX512F-ONLY-NEXT:    andl $1, %r14d
-; AVX512F-ONLY-NEXT:    movq %r14, %rbp
-; AVX512F-ONLY-NEXT:    shlq $12, %rbp
-; AVX512F-ONLY-NEXT:    orq %r13, %rbp
-; AVX512F-ONLY-NEXT:    movq %r14, %r13
-; AVX512F-ONLY-NEXT:    shlq $13, %r13
-; AVX512F-ONLY-NEXT:    orq %rbp, %r13
-; AVX512F-ONLY-NEXT:    movq %r14, %rbp
-; AVX512F-ONLY-NEXT:    shlq $14, %rbp
-; AVX512F-ONLY-NEXT:    orq %r13, %rbp
-; AVX512F-ONLY-NEXT:    movq %r14, %r13
-; AVX512F-ONLY-NEXT:    shlq $15, %r13
-; AVX512F-ONLY-NEXT:    orq %rbp, %r13
-; AVX512F-ONLY-NEXT:    movq %r14, %rbp
-; AVX512F-ONLY-NEXT:    shlq $16, %rbp
-; AVX512F-ONLY-NEXT:    orq %r13, %rbp
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r13d
-; AVX512F-ONLY-NEXT:    kshiftrw $4, %k3, %k4
-; AVX512F-ONLY-NEXT:    shlq $17, %r14
-; AVX512F-ONLY-NEXT:    orq %rbp, %r14
-; AVX512F-ONLY-NEXT:    movzbl %r13b, %r13d
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %rbp
-; AVX512F-ONLY-NEXT:    shlq $18, %rbp
-; AVX512F-ONLY-NEXT:    orq %r14, %rbp
-; AVX512F-ONLY-NEXT:    movq %r13, %r14
-; AVX512F-ONLY-NEXT:    shlq $19, %r14
-; AVX512F-ONLY-NEXT:    orq %rbp, %r14
-; AVX512F-ONLY-NEXT:    movq %r13, %rbp
-; AVX512F-ONLY-NEXT:    shlq $20, %rbp
-; AVX512F-ONLY-NEXT:    orq %r14, %rbp
-; AVX512F-ONLY-NEXT:    movq %r13, %r14
-; AVX512F-ONLY-NEXT:    shlq $21, %r14
-; AVX512F-ONLY-NEXT:    orq %rbp, %r14
-; AVX512F-ONLY-NEXT:    movq %r13, %rbp
-; AVX512F-ONLY-NEXT:    shlq $22, %rbp
-; AVX512F-ONLY-NEXT:    orq %r14, %rbp
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r14d
-; AVX512F-ONLY-NEXT:    kshiftrw $10, %k3, %k4
-; AVX512F-ONLY-NEXT:    shlq $23, %r13
-; AVX512F-ONLY-NEXT:    orq %rbp, %r13
-; AVX512F-ONLY-NEXT:    movzbl %r14b, %r14d
-; AVX512F-ONLY-NEXT:    andl $1, %r14d
-; AVX512F-ONLY-NEXT:    movq %r14, %rbp
-; AVX512F-ONLY-NEXT:    shlq $24, %rbp
-; AVX512F-ONLY-NEXT:    orq %r13, %rbp
-; AVX512F-ONLY-NEXT:    movq %r14, %r13
-; AVX512F-ONLY-NEXT:    shlq $25, %r13
-; AVX512F-ONLY-NEXT:    orq %rbp, %r13
-; AVX512F-ONLY-NEXT:    movq %r14, %rbp
-; AVX512F-ONLY-NEXT:    shlq $26, %rbp
-; AVX512F-ONLY-NEXT:    orq %r13, %rbp
-; AVX512F-ONLY-NEXT:    movq %r14, %r13
-; AVX512F-ONLY-NEXT:    shlq $27, %r13
-; AVX512F-ONLY-NEXT:    orq %rbp, %r13
-; AVX512F-ONLY-NEXT:    movq %r14, %rbp
-; AVX512F-ONLY-NEXT:    shlq $28, %rbp
-; AVX512F-ONLY-NEXT:    orq %r13, %rbp
-; AVX512F-ONLY-NEXT:    shlq $29, %r14
-; AVX512F-ONLY-NEXT:    orq %rbp, %r14
-; AVX512F-ONLY-NEXT:    movzbl %bl, %r13d
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $30, %rax
-; AVX512F-ONLY-NEXT:    orq %r14, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k4, %ebp
-; AVX512F-ONLY-NEXT:    kshiftrw $6, %k3, %k4
-; AVX512F-ONLY-NEXT:    shlq $31, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    andl $1, %ebx
-; AVX512F-ONLY-NEXT:    movq %rbx, %rax
-; AVX512F-ONLY-NEXT:    shlq $32, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %rbx, %r14
-; AVX512F-ONLY-NEXT:    shlq $33, %r14
-; AVX512F-ONLY-NEXT:    orq %rax, %r14
-; AVX512F-ONLY-NEXT:    movq %rbx, %rax
-; AVX512F-ONLY-NEXT:    shlq $34, %rax
-; AVX512F-ONLY-NEXT:    orq %r14, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r14d
-; AVX512F-ONLY-NEXT:    kshiftrw $7, %k3, %k4
-; AVX512F-ONLY-NEXT:    shlq $35, %rbx
-; AVX512F-ONLY-NEXT:    orq %rax, %rbx
-; AVX512F-ONLY-NEXT:    andl $1, %r14d
-; AVX512F-ONLY-NEXT:    movq %r14, %rax
-; AVX512F-ONLY-NEXT:    shlq $36, %rax
-; AVX512F-ONLY-NEXT:    orq %rbx, %rax
-; AVX512F-ONLY-NEXT:    movq %r14, %rbx
-; AVX512F-ONLY-NEXT:    shlq $37, %rbx
-; AVX512F-ONLY-NEXT:    orq %rax, %rbx
-; AVX512F-ONLY-NEXT:    movq %r14, %rax
-; AVX512F-ONLY-NEXT:    shlq $38, %rax
-; AVX512F-ONLY-NEXT:    orq %rbx, %rax
-; AVX512F-ONLY-NEXT:    movq %r14, %rbx
-; AVX512F-ONLY-NEXT:    shlq $39, %rbx
-; AVX512F-ONLY-NEXT:    orq %rax, %rbx
-; AVX512F-ONLY-NEXT:    movq %r14, %rax
-; AVX512F-ONLY-NEXT:    shlq $40, %rax
-; AVX512F-ONLY-NEXT:    orq %rbx, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k4, %ebx
-; AVX512F-ONLY-NEXT:    kshiftrw $8, %k3, %k4
-; AVX512F-ONLY-NEXT:    shlq $41, %r14
-; AVX512F-ONLY-NEXT:    orq %rax, %r14
-; AVX512F-ONLY-NEXT:    andl $1, %ebx
-; AVX512F-ONLY-NEXT:    movq %rbx, %rax
-; AVX512F-ONLY-NEXT:    shlq $42, %rax
-; AVX512F-ONLY-NEXT:    orq %r14, %rax
-; AVX512F-ONLY-NEXT:    movq %rbx, %r14
-; AVX512F-ONLY-NEXT:    shlq $43, %r14
-; AVX512F-ONLY-NEXT:    orq %rax, %r14
-; AVX512F-ONLY-NEXT:    movq %rbx, %rax
-; AVX512F-ONLY-NEXT:    shlq $44, %rax
-; AVX512F-ONLY-NEXT:    orq %r14, %rax
-; AVX512F-ONLY-NEXT:    movq %rbx, %r14
-; AVX512F-ONLY-NEXT:    shlq $45, %r14
-; AVX512F-ONLY-NEXT:    orq %rax, %r14
-; AVX512F-ONLY-NEXT:    movq %rbx, %rax
-; AVX512F-ONLY-NEXT:    shlq $46, %rax
-; AVX512F-ONLY-NEXT:    orq %r14, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r14d
-; AVX512F-ONLY-NEXT:    kshiftrw $9, %k3, %k3
-; AVX512F-ONLY-NEXT:    shlq $47, %rbx
-; AVX512F-ONLY-NEXT:    orq %rax, %rbx
-; AVX512F-ONLY-NEXT:    andl $1, %r14d
-; AVX512F-ONLY-NEXT:    movq %r14, %rax
-; AVX512F-ONLY-NEXT:    shlq $48, %rax
-; AVX512F-ONLY-NEXT:    orq %rbx, %rax
-; AVX512F-ONLY-NEXT:    movq %r14, %rbx
-; AVX512F-ONLY-NEXT:    shlq $49, %rbx
-; AVX512F-ONLY-NEXT:    orq %rax, %rbx
-; AVX512F-ONLY-NEXT:    movq %r14, %rax
-; AVX512F-ONLY-NEXT:    shlq $50, %rax
-; AVX512F-ONLY-NEXT:    orq %rbx, %rax
-; AVX512F-ONLY-NEXT:    movq %r14, %rbx
-; AVX512F-ONLY-NEXT:    shlq $51, %rbx
-; AVX512F-ONLY-NEXT:    orq %rax, %rbx
-; AVX512F-ONLY-NEXT:    movq %r14, %rax
-; AVX512F-ONLY-NEXT:    shlq $52, %rax
-; AVX512F-ONLY-NEXT:    orq %rbx, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k3, %ebx
-; AVX512F-ONLY-NEXT:    kshiftrw $10, %k1, %k3
-; AVX512F-ONLY-NEXT:    shlq $53, %r14
-; AVX512F-ONLY-NEXT:    orq %rax, %r14
-; AVX512F-ONLY-NEXT:    andl $1, %ebx
-; AVX512F-ONLY-NEXT:    movq %rbx, %rax
-; AVX512F-ONLY-NEXT:    shlq $54, %rax
-; AVX512F-ONLY-NEXT:    orq %r14, %rax
-; AVX512F-ONLY-NEXT:    movq %rbx, %r14
-; AVX512F-ONLY-NEXT:    shlq $55, %r14
-; AVX512F-ONLY-NEXT:    orq %rax, %r14
-; AVX512F-ONLY-NEXT:    movq %rbx, %rax
-; AVX512F-ONLY-NEXT:    shlq $56, %rax
-; AVX512F-ONLY-NEXT:    orq %r14, %rax
-; AVX512F-ONLY-NEXT:    movq %rbx, %r14
-; AVX512F-ONLY-NEXT:    shlq $57, %r14
-; AVX512F-ONLY-NEXT:    orq %rax, %r14
-; AVX512F-ONLY-NEXT:    movq %rbx, %rax
-; AVX512F-ONLY-NEXT:    shlq $58, %rax
-; AVX512F-ONLY-NEXT:    orq %r14, %rax
-; AVX512F-ONLY-NEXT:    shlq $59, %rbx
-; AVX512F-ONLY-NEXT:    orq %rax, %rbx
-; AVX512F-ONLY-NEXT:    movl %ebp, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %r14
-; AVX512F-ONLY-NEXT:    shlq $60, %r14
-; AVX512F-ONLY-NEXT:    orq %rbx, %r14
-; AVX512F-ONLY-NEXT:    movq %rax, %r13
-; AVX512F-ONLY-NEXT:    shlq $61, %r13
-; AVX512F-ONLY-NEXT:    orq %r14, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k3, %r14d
-; AVX512F-ONLY-NEXT:    kshiftrw $12, %k2, %k3
-; AVX512F-ONLY-NEXT:    movzbl %bpl, %ebx
-; AVX512F-ONLY-NEXT:    shlq $62, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %rbx, %r13
-; AVX512F-ONLY-NEXT:    shlq $63, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k3, %eax
-; AVX512F-ONLY-NEXT:    kshiftrw $11, %k2, %k3
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k3, %ebp
-; AVX512F-ONLY-NEXT:    kshiftrw $13, %k2, %k3
-; AVX512F-ONLY-NEXT:    movq %r13, (%rsi)
-; AVX512F-ONLY-NEXT:    movzbl %bpl, %r13d
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    leaq (%r15,%r15,2), %r15
-; AVX512F-ONLY-NEXT:    leaq (%r15,%r13,4), %r15
-; AVX512F-ONLY-NEXT:    leaq (%r15,%r13,8), %r15
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $4, %r12
-; AVX512F-ONLY-NEXT:    orq %r15, %r12
-; AVX512F-ONLY-NEXT:    movq %r13, %r15
-; AVX512F-ONLY-NEXT:    shlq $5, %r15
-; AVX512F-ONLY-NEXT:    orq %r12, %r15
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $6, %r12
-; AVX512F-ONLY-NEXT:    shlq $7, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    movzbl %al, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $8, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $9, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $10, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $11, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $12, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k3, %ebp
-; AVX512F-ONLY-NEXT:    kshiftrw $14, %k2, %k3
-; AVX512F-ONLY-NEXT:    shlq $13, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movzbl %bpl, %r13d
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $14, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $15, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $16, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $17, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $18, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k3, %ebp
-; AVX512F-ONLY-NEXT:    kshiftrw $15, %k2, %k2
-; AVX512F-ONLY-NEXT:    shlq $19, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movzbl %bpl, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $20, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $21, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $22, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $23, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $24, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k2, %ebp
-; AVX512F-ONLY-NEXT:    kshiftrw $1, %k1, %k2
-; AVX512F-ONLY-NEXT:    shlq $25, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movzbl %bpl, %r13d
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $26, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $27, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $28, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $29, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $30, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $31, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k1, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $32, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $33, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $34, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $35, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $36, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r13d
-; AVX512F-ONLY-NEXT:    kshiftrw $2, %k1, %k2
-; AVX512F-ONLY-NEXT:    shlq $37, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $38, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $39, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $40, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $41, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $42, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $3, %k1, %k2
-; AVX512F-ONLY-NEXT:    shlq $43, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $44, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $45, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $46, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $47, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $48, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r13d
-; AVX512F-ONLY-NEXT:    kshiftrw $4, %k1, %k2
-; AVX512F-ONLY-NEXT:    shlq $49, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $50, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $51, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $52, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $53, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $54, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $5, %k1, %k2
-; AVX512F-ONLY-NEXT:    shlq $55, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $56, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $57, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $58, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $59, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $60, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k2, %ebp
-; AVX512F-ONLY-NEXT:    kshiftrw $6, %k1, %k2
-; AVX512F-ONLY-NEXT:    shlq $61, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movzbl %bpl, %eax
-; AVX512F-ONLY-NEXT:    movl %ebp, %r13d
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    shlq $62, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    movq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $63, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k2, %ebp
-; AVX512F-ONLY-NEXT:    kshiftrw $7, %k1, %k2
-; AVX512F-ONLY-NEXT:    orq %r15, %r12
-; AVX512F-ONLY-NEXT:    movq %r12, 32(%rsi)
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    leaq (%rax,%rax,2), %r15
-; AVX512F-ONLY-NEXT:    leaq (%r15,%rax,4), %r15
-; AVX512F-ONLY-NEXT:    leaq (%r15,%rax,8), %rax
-; AVX512F-ONLY-NEXT:    movzbl %bpl, %r13d
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $4, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r13, %r15
-; AVX512F-ONLY-NEXT:    shlq $5, %r15
-; AVX512F-ONLY-NEXT:    orq %r12, %r15
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $6, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $7, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $8, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k2, %ebp
-; AVX512F-ONLY-NEXT:    kshiftrw $8, %k1, %k2
-; AVX512F-ONLY-NEXT:    shlq $9, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movzbl %bpl, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $10, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $11, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $12, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $13, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $14, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k2, %ebp
-; AVX512F-ONLY-NEXT:    kshiftrw $9, %k1, %k2
-; AVX512F-ONLY-NEXT:    shlq $15, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movzbl %bpl, %r13d
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $16, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $17, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $18, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $19, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $20, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k2, %ebp
-; AVX512F-ONLY-NEXT:    kshiftrw $15, %k1, %k2
-; AVX512F-ONLY-NEXT:    shlq $21, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movzbl %bpl, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $22, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $23, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $24, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $25, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $26, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $27, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movzbl %r14b, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %r13
-; AVX512F-ONLY-NEXT:    shlq $28, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    movq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $29, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    movq %rax, %r13
-; AVX512F-ONLY-NEXT:    shlq $30, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k2, %ebp
-; AVX512F-ONLY-NEXT:    kshiftrw $11, %k1, %k2
-; AVX512F-ONLY-NEXT:    shlq $31, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    andl $1, %r14d
-; AVX512F-ONLY-NEXT:    movq %r14, %r13
-; AVX512F-ONLY-NEXT:    shlq $32, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $12, %k1, %k2
-; AVX512F-ONLY-NEXT:    shlq $33, %r14
-; AVX512F-ONLY-NEXT:    orq %r13, %r14
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $34, %rax
-; AVX512F-ONLY-NEXT:    orq %r14, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r14
-; AVX512F-ONLY-NEXT:    shlq $35, %r14
-; AVX512F-ONLY-NEXT:    orq %rax, %r14
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $36, %rax
-; AVX512F-ONLY-NEXT:    orq %r14, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r14
-; AVX512F-ONLY-NEXT:    shlq $37, %r14
-; AVX512F-ONLY-NEXT:    orq %rax, %r14
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $38, %rax
-; AVX512F-ONLY-NEXT:    orq %r14, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r14d
-; AVX512F-ONLY-NEXT:    kshiftrw $13, %k1, %k2
-; AVX512F-ONLY-NEXT:    shlq $39, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    andl $1, %r14d
-; AVX512F-ONLY-NEXT:    movq %r14, %rax
-; AVX512F-ONLY-NEXT:    shlq $40, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r14, %r12
-; AVX512F-ONLY-NEXT:    shlq $41, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r14, %rax
-; AVX512F-ONLY-NEXT:    shlq $42, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r14, %r12
-; AVX512F-ONLY-NEXT:    shlq $43, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r14, %rax
-; AVX512F-ONLY-NEXT:    shlq $44, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $14, %k1, %k1
-; AVX512F-ONLY-NEXT:    shlq $45, %r14
-; AVX512F-ONLY-NEXT:    orq %rax, %r14
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $46, %rax
-; AVX512F-ONLY-NEXT:    orq %r14, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r14
-; AVX512F-ONLY-NEXT:    shlq $47, %r14
-; AVX512F-ONLY-NEXT:    orq %rax, %r14
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $48, %rax
-; AVX512F-ONLY-NEXT:    orq %r14, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r14
-; AVX512F-ONLY-NEXT:    shlq $49, %r14
-; AVX512F-ONLY-NEXT:    orq %rax, %r14
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $50, %rax
-; AVX512F-ONLY-NEXT:    orq %r14, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k1, %r14d
-; AVX512F-ONLY-NEXT:    kshiftrw $5, %k0, %k1
-; AVX512F-ONLY-NEXT:    shlq $51, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    andl $1, %r14d
-; AVX512F-ONLY-NEXT:    movq %r14, %rax
-; AVX512F-ONLY-NEXT:    shlq $52, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r14, %r12
-; AVX512F-ONLY-NEXT:    shlq $53, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r14, %rax
-; AVX512F-ONLY-NEXT:    shlq $54, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r14, %r12
-; AVX512F-ONLY-NEXT:    shlq $55, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r14, %rax
-; AVX512F-ONLY-NEXT:    shlq $56, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $57, %r14
-; AVX512F-ONLY-NEXT:    orq %rax, %r14
-; AVX512F-ONLY-NEXT:    movzbl %bpl, %eax
-; AVX512F-ONLY-NEXT:    movl %ebp, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $58, %r13
-; AVX512F-ONLY-NEXT:    orq %r14, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %r14
-; AVX512F-ONLY-NEXT:    shlq $59, %r14
-; AVX512F-ONLY-NEXT:    orq %r13, %r14
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $60, %r13
-; AVX512F-ONLY-NEXT:    orq %r14, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %rbp
-; AVX512F-ONLY-NEXT:    shlq $61, %rbp
-; AVX512F-ONLY-NEXT:    orq %r13, %rbp
-; AVX512F-ONLY-NEXT:    kmovw %k1, %r14d
-; AVX512F-ONLY-NEXT:    kshiftrw $7, %k0, %k1
-; AVX512F-ONLY-NEXT:    shlq $62, %r12
-; AVX512F-ONLY-NEXT:    orq %rbp, %r12
-; AVX512F-ONLY-NEXT:    shlq $63, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movq %rax, 40(%rsi)
-; AVX512F-ONLY-NEXT:    movzbl %r14b, %r15d
-; AVX512F-ONLY-NEXT:    movl %r15d, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    leaq (%rax,%rax,2), %r12
-; AVX512F-ONLY-NEXT:    leaq (%r12,%rax,4), %r12
-; AVX512F-ONLY-NEXT:    leaq (%r12,%rax,8), %rax
-; AVX512F-ONLY-NEXT:    movzbl %r11b, %r13d
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $4, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r13, %r11
-; AVX512F-ONLY-NEXT:    shlq $5, %r11
-; AVX512F-ONLY-NEXT:    orq %r12, %r11
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $6, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $7, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $8, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k1, %ebp
-; AVX512F-ONLY-NEXT:    kshiftrw $8, %k0, %k1
-; AVX512F-ONLY-NEXT:    shlq $9, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movzbl %bpl, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $10, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $11, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $12, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $13, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $14, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k1, %ebp
-; AVX512F-ONLY-NEXT:    kshiftrw $9, %k0, %k1
-; AVX512F-ONLY-NEXT:    shlq $15, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movzbl %bpl, %r13d
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $16, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $17, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $18, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $19, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $20, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k1, %ebp
-; AVX512F-ONLY-NEXT:    kshiftrw $15, %k0, %k1
-; AVX512F-ONLY-NEXT:    shlq $21, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movzbl %bpl, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $22, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $23, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $24, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $25, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $26, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $27, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movzbl %r10b, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %r13
-; AVX512F-ONLY-NEXT:    shlq $28, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    movq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $29, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    movq %rax, %r13
-; AVX512F-ONLY-NEXT:    shlq $30, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k1, %ebp
-; AVX512F-ONLY-NEXT:    kshiftrw $11, %k0, %k1
-; AVX512F-ONLY-NEXT:    shlq $31, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    andl $1, %r10d
-; AVX512F-ONLY-NEXT:    movq %r10, %r13
-; AVX512F-ONLY-NEXT:    shlq $32, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k1, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $12, %k0, %k1
-; AVX512F-ONLY-NEXT:    shlq $33, %r10
-; AVX512F-ONLY-NEXT:    orq %r13, %r10
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $34, %rax
-; AVX512F-ONLY-NEXT:    orq %r10, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r10
-; AVX512F-ONLY-NEXT:    shlq $35, %r10
-; AVX512F-ONLY-NEXT:    orq %rax, %r10
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $36, %rax
-; AVX512F-ONLY-NEXT:    orq %r10, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r10
-; AVX512F-ONLY-NEXT:    shlq $37, %r10
-; AVX512F-ONLY-NEXT:    orq %rax, %r10
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $38, %rax
-; AVX512F-ONLY-NEXT:    orq %r10, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k1, %r10d
-; AVX512F-ONLY-NEXT:    kshiftrw $13, %k0, %k1
-; AVX512F-ONLY-NEXT:    shlq $39, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    andl $1, %r10d
-; AVX512F-ONLY-NEXT:    movq %r10, %rax
-; AVX512F-ONLY-NEXT:    shlq $40, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r10, %r12
-; AVX512F-ONLY-NEXT:    shlq $41, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r10, %rax
-; AVX512F-ONLY-NEXT:    shlq $42, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r10, %r12
-; AVX512F-ONLY-NEXT:    shlq $43, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r10, %rax
-; AVX512F-ONLY-NEXT:    shlq $44, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k1, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $14, %k0, %k1
-; AVX512F-ONLY-NEXT:    shlq $45, %r10
-; AVX512F-ONLY-NEXT:    orq %rax, %r10
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $46, %rax
-; AVX512F-ONLY-NEXT:    orq %r10, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r10
-; AVX512F-ONLY-NEXT:    shlq $47, %r10
-; AVX512F-ONLY-NEXT:    orq %rax, %r10
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $48, %rax
-; AVX512F-ONLY-NEXT:    orq %r10, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r10
-; AVX512F-ONLY-NEXT:    shlq $49, %r10
-; AVX512F-ONLY-NEXT:    orq %rax, %r10
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $50, %rax
-; AVX512F-ONLY-NEXT:    orq %r10, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k1, %r13d
-; AVX512F-ONLY-NEXT:    kshiftrw $1, %k0, %k1
-; AVX512F-ONLY-NEXT:    shlq $51, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $52, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r10
-; AVX512F-ONLY-NEXT:    shlq $53, %r10
-; AVX512F-ONLY-NEXT:    orq %rax, %r10
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $54, %rax
-; AVX512F-ONLY-NEXT:    orq %r10, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r10
-; AVX512F-ONLY-NEXT:    shlq $55, %r10
-; AVX512F-ONLY-NEXT:    orq %rax, %r10
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $56, %rax
-; AVX512F-ONLY-NEXT:    orq %r10, %rax
-; AVX512F-ONLY-NEXT:    shlq $57, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movl %ebp, %r10d
-; AVX512F-ONLY-NEXT:    andl $1, %r10d
-; AVX512F-ONLY-NEXT:    movq %r10, %rax
-; AVX512F-ONLY-NEXT:    shlq $58, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r10, %r12
-; AVX512F-ONLY-NEXT:    shlq $59, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r10, %rax
-; AVX512F-ONLY-NEXT:    shlq $60, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r10, %r12
-; AVX512F-ONLY-NEXT:    shlq $61, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k1, %r13d
-; AVX512F-ONLY-NEXT:    kshiftrw $2, %k0, %k1
-; AVX512F-ONLY-NEXT:    shlq $62, %r10
-; AVX512F-ONLY-NEXT:    orq %r12, %r10
-; AVX512F-ONLY-NEXT:    kmovw %k1, %r12d
-; AVX512F-ONLY-NEXT:    movzbl %bpl, %ebp
-; AVX512F-ONLY-NEXT:    shlq $63, %rbp
-; AVX512F-ONLY-NEXT:    orq %r10, %rbp
-; AVX512F-ONLY-NEXT:    kmovw %k0, %r10d
-; AVX512F-ONLY-NEXT:    kshiftrw $4, %k0, %k0
-; AVX512F-ONLY-NEXT:    orq %r11, %rbp
-; AVX512F-ONLY-NEXT:    kmovw %k0, %r11d
-; AVX512F-ONLY-NEXT:    movq %rbp, 16(%rsi)
-; AVX512F-ONLY-NEXT:    movzbl %r9b, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %ebx
-; AVX512F-ONLY-NEXT:    leaq (%rbx,%rbx,2), %r9
-; AVX512F-ONLY-NEXT:    leaq (%r9,%rax,4), %r9
-; AVX512F-ONLY-NEXT:    leaq (%r9,%rax,8), %r9
-; AVX512F-ONLY-NEXT:    movq %rax, %rbx
-; AVX512F-ONLY-NEXT:    shlq $4, %rbx
-; AVX512F-ONLY-NEXT:    orq %r9, %rbx
-; AVX512F-ONLY-NEXT:    movq %rax, %r9
-; AVX512F-ONLY-NEXT:    shlq $5, %r9
-; AVX512F-ONLY-NEXT:    orq %rbx, %r9
-; AVX512F-ONLY-NEXT:    movq %rax, %rbx
-; AVX512F-ONLY-NEXT:    shlq $6, %rbx
-; AVX512F-ONLY-NEXT:    shlq $7, %rax
-; AVX512F-ONLY-NEXT:    orq %rbx, %rax
-; AVX512F-ONLY-NEXT:    movzbl %r8b, %r8d
-; AVX512F-ONLY-NEXT:    andl $1, %r8d
-; AVX512F-ONLY-NEXT:    movq %r8, %rbx
-; AVX512F-ONLY-NEXT:    shlq $8, %rbx
-; AVX512F-ONLY-NEXT:    orq %rax, %rbx
-; AVX512F-ONLY-NEXT:    movq %r8, %rax
-; AVX512F-ONLY-NEXT:    shlq $9, %rax
-; AVX512F-ONLY-NEXT:    orq %rbx, %rax
-; AVX512F-ONLY-NEXT:    movq %r8, %rbx
-; AVX512F-ONLY-NEXT:    shlq $10, %rbx
-; AVX512F-ONLY-NEXT:    orq %rax, %rbx
-; AVX512F-ONLY-NEXT:    movq %r8, %rax
-; AVX512F-ONLY-NEXT:    shlq $11, %rax
-; AVX512F-ONLY-NEXT:    orq %rbx, %rax
-; AVX512F-ONLY-NEXT:    movq %r8, %rbx
-; AVX512F-ONLY-NEXT:    shlq $12, %rbx
-; AVX512F-ONLY-NEXT:    orq %rax, %rbx
-; AVX512F-ONLY-NEXT:    shlq $13, %r8
-; AVX512F-ONLY-NEXT:    orq %rbx, %r8
-; AVX512F-ONLY-NEXT:    movzbl %dil, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %rdi
-; AVX512F-ONLY-NEXT:    shlq $14, %rdi
-; AVX512F-ONLY-NEXT:    orq %r8, %rdi
-; AVX512F-ONLY-NEXT:    movq %rax, %r8
-; AVX512F-ONLY-NEXT:    shlq $15, %r8
-; AVX512F-ONLY-NEXT:    orq %rdi, %r8
-; AVX512F-ONLY-NEXT:    movq %rax, %rdi
-; AVX512F-ONLY-NEXT:    shlq $16, %rdi
-; AVX512F-ONLY-NEXT:    orq %r8, %rdi
-; AVX512F-ONLY-NEXT:    movq %rax, %r8
-; AVX512F-ONLY-NEXT:    shlq $17, %r8
-; AVX512F-ONLY-NEXT:    orq %rdi, %r8
-; AVX512F-ONLY-NEXT:    movq %rax, %rdi
-; AVX512F-ONLY-NEXT:    shlq $18, %rdi
-; AVX512F-ONLY-NEXT:    orq %r8, %rdi
-; AVX512F-ONLY-NEXT:    shlq $19, %rax
-; AVX512F-ONLY-NEXT:    orq %rdi, %rax
-; AVX512F-ONLY-NEXT:    movzbl %dl, %edx
-; AVX512F-ONLY-NEXT:    andl $1, %edx
-; AVX512F-ONLY-NEXT:    movq %rdx, %rdi
-; AVX512F-ONLY-NEXT:    shlq $20, %rdi
-; AVX512F-ONLY-NEXT:    orq %rax, %rdi
-; AVX512F-ONLY-NEXT:    movq %rdx, %rax
-; AVX512F-ONLY-NEXT:    shlq $21, %rax
-; AVX512F-ONLY-NEXT:    orq %rdi, %rax
-; AVX512F-ONLY-NEXT:    movq %rdx, %rdi
-; AVX512F-ONLY-NEXT:    shlq $22, %rdi
-; AVX512F-ONLY-NEXT:    orq %rax, %rdi
-; AVX512F-ONLY-NEXT:    movq %rdx, %rax
-; AVX512F-ONLY-NEXT:    shlq $23, %rax
-; AVX512F-ONLY-NEXT:    orq %rdi, %rax
-; AVX512F-ONLY-NEXT:    movq %rdx, %rdi
-; AVX512F-ONLY-NEXT:    shlq $24, %rdi
-; AVX512F-ONLY-NEXT:    orq %rax, %rdi
-; AVX512F-ONLY-NEXT:    shlq $25, %rdx
-; AVX512F-ONLY-NEXT:    orq %rdi, %rdx
-; AVX512F-ONLY-NEXT:    movzbl %cl, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %rcx
-; AVX512F-ONLY-NEXT:    shlq $26, %rcx
-; AVX512F-ONLY-NEXT:    orq %rdx, %rcx
-; AVX512F-ONLY-NEXT:    movq %rax, %rdx
-; AVX512F-ONLY-NEXT:    shlq $27, %rdx
-; AVX512F-ONLY-NEXT:    orq %rcx, %rdx
-; AVX512F-ONLY-NEXT:    movq %rax, %rcx
-; AVX512F-ONLY-NEXT:    shlq $28, %rcx
-; AVX512F-ONLY-NEXT:    orq %rdx, %rcx
-; AVX512F-ONLY-NEXT:    movq %rax, %rdx
-; AVX512F-ONLY-NEXT:    shlq $29, %rdx
-; AVX512F-ONLY-NEXT:    orq %rcx, %rdx
-; AVX512F-ONLY-NEXT:    movq %rax, %rcx
-; AVX512F-ONLY-NEXT:    shlq $30, %rcx
-; AVX512F-ONLY-NEXT:    orq %rdx, %rcx
-; AVX512F-ONLY-NEXT:    shlq $31, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    andl $1, %r10d
-; AVX512F-ONLY-NEXT:    movq %r10, %rcx
-; AVX512F-ONLY-NEXT:    shlq $32, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    movq %r10, %rax
-; AVX512F-ONLY-NEXT:    shlq $33, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    movq %r10, %rcx
-; AVX512F-ONLY-NEXT:    shlq $34, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    movq %r10, %rax
-; AVX512F-ONLY-NEXT:    shlq $35, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    movq %r10, %rcx
-; AVX512F-ONLY-NEXT:    shlq $36, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    shlq $37, %r10
-; AVX512F-ONLY-NEXT:    orq %rcx, %r10
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $38, %rax
-; AVX512F-ONLY-NEXT:    orq %r10, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %rcx
-; AVX512F-ONLY-NEXT:    shlq $39, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $40, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %rcx
-; AVX512F-ONLY-NEXT:    shlq $41, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $42, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    shlq $43, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $44, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %rcx
-; AVX512F-ONLY-NEXT:    shlq $45, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $46, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %rcx
-; AVX512F-ONLY-NEXT:    shlq $47, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $48, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    shlq $49, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %rcx
-; AVX512F-ONLY-NEXT:    shlq $50, %rcx
-; AVX512F-ONLY-NEXT:    orq %r12, %rcx
-; AVX512F-ONLY-NEXT:    movq %rax, %rdx
-; AVX512F-ONLY-NEXT:    shlq $51, %rdx
-; AVX512F-ONLY-NEXT:    orq %rcx, %rdx
-; AVX512F-ONLY-NEXT:    movq %rax, %rcx
-; AVX512F-ONLY-NEXT:    shlq $52, %rcx
-; AVX512F-ONLY-NEXT:    orq %rdx, %rcx
-; AVX512F-ONLY-NEXT:    movq %rax, %rdx
-; AVX512F-ONLY-NEXT:    shlq $53, %rdx
-; AVX512F-ONLY-NEXT:    orq %rcx, %rdx
-; AVX512F-ONLY-NEXT:    movq %rax, %rcx
-; AVX512F-ONLY-NEXT:    shlq $54, %rcx
-; AVX512F-ONLY-NEXT:    orq %rdx, %rcx
-; AVX512F-ONLY-NEXT:    shlq $55, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    andl $1, %r11d
-; AVX512F-ONLY-NEXT:    movq %r11, %rcx
-; AVX512F-ONLY-NEXT:    shlq $56, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    movq %r11, %rax
-; AVX512F-ONLY-NEXT:    shlq $57, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    movq %r11, %rcx
-; AVX512F-ONLY-NEXT:    shlq $58, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    movq %r11, %rax
-; AVX512F-ONLY-NEXT:    shlq $59, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    movq %r11, %rcx
-; AVX512F-ONLY-NEXT:    shlq $60, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    shlq $61, %r11
-; AVX512F-ONLY-NEXT:    orq %rcx, %r11
-; AVX512F-ONLY-NEXT:    andl $1, %r14d
-; AVX512F-ONLY-NEXT:    shlq $62, %r14
-; AVX512F-ONLY-NEXT:    orq %r11, %r14
-; AVX512F-ONLY-NEXT:    shlq $63, %r15
-; AVX512F-ONLY-NEXT:    orq %r14, %r15
-; AVX512F-ONLY-NEXT:    orq %r9, %r15
-; AVX512F-ONLY-NEXT:    movq %r15, 8(%rsi)
-; AVX512F-ONLY-NEXT:    popq %rbx
-; AVX512F-ONLY-NEXT:    popq %r12
-; AVX512F-ONLY-NEXT:    popq %r13
-; AVX512F-ONLY-NEXT:    popq %r14
-; AVX512F-ONLY-NEXT:    popq %r15
-; AVX512F-ONLY-NEXT:    popq %rbp
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
+; AVX512F-ONLY-NEXT:    kmovw 4(%rdi), %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z}
+; AVX512F-ONLY-NEXT:    kmovw 2(%rdi), %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15]
+; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm2, %zmm6
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13]
+; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm7, %zmm8
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10]
+; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm9, %zmm10
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7]
+; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm11, %zmm12
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm13 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5]
+; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm13, %zmm14
+; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm1, %zmm3
+; AVX512F-ONLY-NEXT:    vpermd %zmm4, %zmm2, %zmm15
+; AVX512F-ONLY-NEXT:    vpermd %zmm4, %zmm7, %zmm16
+; AVX512F-ONLY-NEXT:    vpermd %zmm4, %zmm9, %zmm17
+; AVX512F-ONLY-NEXT:    vpermd %zmm4, %zmm11, %zmm18
+; AVX512F-ONLY-NEXT:    vpermd %zmm4, %zmm13, %zmm19
+; AVX512F-ONLY-NEXT:    vpermd %zmm4, %zmm1, %zmm4
+; AVX512F-ONLY-NEXT:    vpermd %zmm5, %zmm2, %zmm20
+; AVX512F-ONLY-NEXT:    vpermd %zmm5, %zmm7, %zmm21
+; AVX512F-ONLY-NEXT:    vpermd %zmm5, %zmm9, %zmm22
+; AVX512F-ONLY-NEXT:    vpermd %zmm5, %zmm11, %zmm23
+; AVX512F-ONLY-NEXT:    vpermd %zmm5, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    vpermd %zmm5, %zmm13, %zmm5
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm2, %zmm2
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm7, %zmm7
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm9, %zmm9
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm11, %zmm11
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm13, %zmm0
+; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm13 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm11, %zmm11, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm11 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm9, %zmm9, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm9 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm7, %zmm7, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 256(%rsi), %zmm7 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 320(%rsi), %zmm2 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 384(%rsi), %zmm1 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm5, %zmm5, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 448(%rsi), %zmm5 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm23, %zmm23, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 512(%rsi), %zmm23 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm22, %zmm22, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 576(%rsi), %zmm22 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm21, %zmm21, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 640(%rsi), %zmm21 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm20, %zmm20, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 704(%rsi), %zmm20 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm4, %zmm4, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 768(%rsi), %zmm4 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm19, %zmm19, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 832(%rsi), %zmm19 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm18, %zmm18, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 896(%rsi), %zmm18 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm17, %zmm17, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 960(%rsi), %zmm17 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm16, %zmm16, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 1024(%rsi), %zmm16 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm15, %zmm15, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 1088(%rsi), %zmm15 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm3, %zmm3, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 1152(%rsi), %zmm3 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm14, %zmm14, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 1216(%rsi), %zmm14 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm12, %zmm12, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 1280(%rsi), %zmm12 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm10, %zmm10, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 1344(%rsi), %zmm10 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm8, %zmm8, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 1408(%rsi), %zmm8 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm6, %zmm6, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 1472(%rsi), %zmm6 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm6, 1472(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm8, 1408(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm10, 1344(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm12, 1280(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm14, 1216(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm3, 1152(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm15, 1088(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm16, 1024(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm17, 960(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm18, 896(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm19, 832(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm4, 768(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm20, 704(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm21, 640(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm22, 576(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm23, 512(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm5, 448(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm1, 384(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm2, 320(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm7, 256(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm9, 192(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm11, 128(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm13, (%rdx)
+; AVX512F-ONLY-NEXT:    vzeroupper
 ; AVX512F-ONLY-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: mask_replication_factor6_vf64:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    pushq %rbp
-; AVX512DQ-NEXT:    pushq %r15
-; AVX512DQ-NEXT:    pushq %r14
-; AVX512DQ-NEXT:    pushq %r13
-; AVX512DQ-NEXT:    pushq %r12
-; AVX512DQ-NEXT:    pushq %rbx
-; AVX512DQ-NEXT:    kmovw (%rdi), %k3
+; AVX512DQ-NEXT:    kmovw (%rdi), %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm2
+; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm2
+; AVX512DQ-NEXT:    movw $1, %ax
+; AVX512DQ-NEXT:    kmovw %eax, %k1
+; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1}
+; AVX512DQ-NEXT:    kmovw 6(%rdi), %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm3
+; AVX512DQ-NEXT:    kmovw 4(%rdi), %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm4
 ; AVX512DQ-NEXT:    kmovw 2(%rdi), %k0
-; AVX512DQ-NEXT:    kmovw 4(%rdi), %k2
-; AVX512DQ-NEXT:    kmovw 6(%rdi), %k1
-; AVX512DQ-NEXT:    kshiftrw $3, %k0, %k4
-; AVX512DQ-NEXT:    kmovw %k4, %eax
-; AVX512DQ-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512DQ-NEXT:    kshiftrw $15, %k3, %k4
-; AVX512DQ-NEXT:    kmovw %k4, %ecx
-; AVX512DQ-NEXT:    kshiftrw $14, %k3, %k4
-; AVX512DQ-NEXT:    kmovw %k4, %edx
-; AVX512DQ-NEXT:    kshiftrw $13, %k3, %k4
-; AVX512DQ-NEXT:    kmovw %k4, %edi
-; AVX512DQ-NEXT:    kshiftrw $12, %k3, %k4
-; AVX512DQ-NEXT:    kmovw %k4, %r8d
-; AVX512DQ-NEXT:    kshiftrw $11, %k3, %k4
-; AVX512DQ-NEXT:    kmovw %k4, %r9d
-; AVX512DQ-NEXT:    kshiftrw $10, %k0, %k4
-; AVX512DQ-NEXT:    kmovw %k4, %r10d
-; AVX512DQ-NEXT:    kshiftrw $6, %k0, %k4
-; AVX512DQ-NEXT:    kmovw %k4, %r11d
-; AVX512DQ-NEXT:    kshiftrw $5, %k3, %k4
-; AVX512DQ-NEXT:    kmovw %k4, %ebx
-; AVX512DQ-NEXT:    kshiftrw $10, %k2, %k4
-; AVX512DQ-NEXT:    kmovw %k4, %ebp
-; AVX512DQ-NEXT:    kshiftrw $1, %k2, %k4
-; AVX512DQ-NEXT:    kmovw %k4, %r15d
-; AVX512DQ-NEXT:    kshiftrw $2, %k2, %k4
-; AVX512DQ-NEXT:    kmovw %k2, %r14d
-; AVX512DQ-NEXT:    movzbl %r14b, %r14d
-; AVX512DQ-NEXT:    andl $1, %r14d
-; AVX512DQ-NEXT:    leaq (%r14,%r14,2), %r12
-; AVX512DQ-NEXT:    leaq (%r12,%r14,4), %r12
-; AVX512DQ-NEXT:    leaq (%r12,%r14,8), %r12
-; AVX512DQ-NEXT:    movq %r14, %r13
-; AVX512DQ-NEXT:    shlq $4, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    shlq $5, %r14
-; AVX512DQ-NEXT:    orq %r13, %r14
-; AVX512DQ-NEXT:    movzbl %r15b, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %r15
-; AVX512DQ-NEXT:    shlq $6, %r15
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $7, %r13
-; AVX512DQ-NEXT:    orq %r15, %r13
-; AVX512DQ-NEXT:    movq %r12, %r15
-; AVX512DQ-NEXT:    shlq $8, %r15
-; AVX512DQ-NEXT:    orq %r13, %r15
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $9, %r13
-; AVX512DQ-NEXT:    orq %r15, %r13
-; AVX512DQ-NEXT:    movq %r12, %r15
-; AVX512DQ-NEXT:    shlq $10, %r15
-; AVX512DQ-NEXT:    orq %r13, %r15
-; AVX512DQ-NEXT:    kmovw %k4, %r13d
-; AVX512DQ-NEXT:    kshiftrw $3, %k2, %k4
-; AVX512DQ-NEXT:    shlq $11, %r12
-; AVX512DQ-NEXT:    orq %r15, %r12
-; AVX512DQ-NEXT:    movzbl %r13b, %r15d
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    movq %r15, %r13
-; AVX512DQ-NEXT:    shlq $12, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $13, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    movq %r15, %r13
-; AVX512DQ-NEXT:    shlq $14, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $15, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    movq %r15, %r13
-; AVX512DQ-NEXT:    shlq $16, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    kmovw %k4, %r12d
-; AVX512DQ-NEXT:    kshiftrw $4, %k2, %k4
-; AVX512DQ-NEXT:    shlq $17, %r15
-; AVX512DQ-NEXT:    orq %r13, %r15
-; AVX512DQ-NEXT:    movzbl %r12b, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $18, %r13
-; AVX512DQ-NEXT:    orq %r15, %r13
-; AVX512DQ-NEXT:    movq %r12, %r15
-; AVX512DQ-NEXT:    shlq $19, %r15
-; AVX512DQ-NEXT:    orq %r13, %r15
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $20, %r13
-; AVX512DQ-NEXT:    orq %r15, %r13
-; AVX512DQ-NEXT:    movq %r12, %r15
-; AVX512DQ-NEXT:    shlq $21, %r15
-; AVX512DQ-NEXT:    orq %r13, %r15
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $22, %r13
-; AVX512DQ-NEXT:    orq %r15, %r13
-; AVX512DQ-NEXT:    kmovw %k4, %r15d
-; AVX512DQ-NEXT:    kshiftrw $5, %k2, %k4
-; AVX512DQ-NEXT:    shlq $23, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    movzbl %r15b, %r15d
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    movq %r15, %r13
-; AVX512DQ-NEXT:    shlq $24, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $25, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    movq %r15, %r13
-; AVX512DQ-NEXT:    shlq $26, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $27, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    movq %r15, %r13
-; AVX512DQ-NEXT:    shlq $28, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    kmovw %k4, %r12d
-; AVX512DQ-NEXT:    kshiftrw $6, %k2, %k4
-; AVX512DQ-NEXT:    shlq $29, %r15
-; AVX512DQ-NEXT:    orq %r13, %r15
-; AVX512DQ-NEXT:    movzbl %r12b, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $30, %r13
-; AVX512DQ-NEXT:    orq %r15, %r13
-; AVX512DQ-NEXT:    movq %r12, %r15
-; AVX512DQ-NEXT:    shlq $31, %r15
-; AVX512DQ-NEXT:    orq %r13, %r15
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $32, %r13
-; AVX512DQ-NEXT:    orq %r15, %r13
-; AVX512DQ-NEXT:    movq %r12, %r15
-; AVX512DQ-NEXT:    shlq $33, %r15
-; AVX512DQ-NEXT:    orq %r13, %r15
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $34, %r13
-; AVX512DQ-NEXT:    orq %r15, %r13
-; AVX512DQ-NEXT:    kmovw %k4, %r15d
-; AVX512DQ-NEXT:    kshiftrw $7, %k2, %k4
-; AVX512DQ-NEXT:    shlq $35, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    movzbl %r15b, %r15d
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    movq %r15, %r13
-; AVX512DQ-NEXT:    shlq $36, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $37, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    movq %r15, %r13
-; AVX512DQ-NEXT:    shlq $38, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $39, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    movq %r15, %r13
-; AVX512DQ-NEXT:    shlq $40, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    kmovw %k4, %r12d
-; AVX512DQ-NEXT:    kshiftrw $8, %k2, %k4
-; AVX512DQ-NEXT:    shlq $41, %r15
-; AVX512DQ-NEXT:    orq %r13, %r15
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $42, %r13
-; AVX512DQ-NEXT:    orq %r15, %r13
-; AVX512DQ-NEXT:    movq %r12, %r15
-; AVX512DQ-NEXT:    shlq $43, %r15
-; AVX512DQ-NEXT:    orq %r13, %r15
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $44, %r13
-; AVX512DQ-NEXT:    orq %r15, %r13
-; AVX512DQ-NEXT:    movq %r12, %r15
-; AVX512DQ-NEXT:    shlq $45, %r15
-; AVX512DQ-NEXT:    orq %r13, %r15
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $46, %r13
-; AVX512DQ-NEXT:    orq %r15, %r13
-; AVX512DQ-NEXT:    kmovw %k4, %r15d
-; AVX512DQ-NEXT:    kshiftrw $9, %k2, %k4
-; AVX512DQ-NEXT:    shlq $47, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    movq %r15, %r13
-; AVX512DQ-NEXT:    shlq $48, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $49, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    movq %r15, %r13
-; AVX512DQ-NEXT:    shlq $50, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $51, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    movq %r15, %r13
-; AVX512DQ-NEXT:    shlq $52, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    kmovw %k4, %r12d
-; AVX512DQ-NEXT:    kshiftrw $1, %k3, %k4
-; AVX512DQ-NEXT:    shlq $53, %r15
-; AVX512DQ-NEXT:    orq %r13, %r15
-; AVX512DQ-NEXT:    movzbl %r12b, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $54, %r13
-; AVX512DQ-NEXT:    orq %r15, %r13
-; AVX512DQ-NEXT:    movq %r12, %r15
-; AVX512DQ-NEXT:    shlq $55, %r15
-; AVX512DQ-NEXT:    orq %r13, %r15
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $56, %r13
-; AVX512DQ-NEXT:    orq %r15, %r13
-; AVX512DQ-NEXT:    movq %r12, %r15
-; AVX512DQ-NEXT:    shlq $57, %r15
-; AVX512DQ-NEXT:    orq %r13, %r15
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $58, %r13
-; AVX512DQ-NEXT:    orq %r15, %r13
-; AVX512DQ-NEXT:    shlq $59, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    movzbl %bpl, %r15d
-; AVX512DQ-NEXT:    movl %ebp, %r13d
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %rbp
-; AVX512DQ-NEXT:    shlq $60, %rbp
-; AVX512DQ-NEXT:    orq %r12, %rbp
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $61, %r12
-; AVX512DQ-NEXT:    orq %rbp, %r12
-; AVX512DQ-NEXT:    kmovw %k4, %ebp
-; AVX512DQ-NEXT:    kshiftrw $2, %k3, %k4
-; AVX512DQ-NEXT:    shlq $62, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $63, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    orq %r14, %r12
-; AVX512DQ-NEXT:    movq %r12, 24(%rsi)
-; AVX512DQ-NEXT:    kmovw %k3, %r14d
-; AVX512DQ-NEXT:    movzbl %r14b, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    leaq (%r12,%r12,2), %r14
-; AVX512DQ-NEXT:    leaq (%r14,%r12,4), %r14
-; AVX512DQ-NEXT:    leaq (%r14,%r12,8), %r14
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $4, %r13
-; AVX512DQ-NEXT:    orq %r14, %r13
-; AVX512DQ-NEXT:    shlq $5, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    movzbl %bpl, %r13d
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %r14
-; AVX512DQ-NEXT:    shlq $6, %r14
-; AVX512DQ-NEXT:    movq %r13, %rbp
-; AVX512DQ-NEXT:    shlq $7, %rbp
-; AVX512DQ-NEXT:    orq %r14, %rbp
-; AVX512DQ-NEXT:    movq %r13, %r14
-; AVX512DQ-NEXT:    shlq $8, %r14
-; AVX512DQ-NEXT:    orq %rbp, %r14
-; AVX512DQ-NEXT:    movq %r13, %rbp
-; AVX512DQ-NEXT:    shlq $9, %rbp
-; AVX512DQ-NEXT:    orq %r14, %rbp
-; AVX512DQ-NEXT:    movq %r13, %r14
-; AVX512DQ-NEXT:    shlq $10, %r14
-; AVX512DQ-NEXT:    orq %rbp, %r14
-; AVX512DQ-NEXT:    kmovw %k4, %ebp
-; AVX512DQ-NEXT:    kshiftrw $3, %k3, %k4
-; AVX512DQ-NEXT:    shlq $11, %r13
-; AVX512DQ-NEXT:    orq %r14, %r13
-; AVX512DQ-NEXT:    movzbl %bpl, %r14d
-; AVX512DQ-NEXT:    andl $1, %r14d
-; AVX512DQ-NEXT:    movq %r14, %rbp
-; AVX512DQ-NEXT:    shlq $12, %rbp
-; AVX512DQ-NEXT:    orq %r13, %rbp
-; AVX512DQ-NEXT:    movq %r14, %r13
-; AVX512DQ-NEXT:    shlq $13, %r13
-; AVX512DQ-NEXT:    orq %rbp, %r13
-; AVX512DQ-NEXT:    movq %r14, %rbp
-; AVX512DQ-NEXT:    shlq $14, %rbp
-; AVX512DQ-NEXT:    orq %r13, %rbp
-; AVX512DQ-NEXT:    movq %r14, %r13
-; AVX512DQ-NEXT:    shlq $15, %r13
-; AVX512DQ-NEXT:    orq %rbp, %r13
-; AVX512DQ-NEXT:    movq %r14, %rbp
-; AVX512DQ-NEXT:    shlq $16, %rbp
-; AVX512DQ-NEXT:    orq %r13, %rbp
-; AVX512DQ-NEXT:    kmovw %k4, %r13d
-; AVX512DQ-NEXT:    kshiftrw $4, %k3, %k4
-; AVX512DQ-NEXT:    shlq $17, %r14
-; AVX512DQ-NEXT:    orq %rbp, %r14
-; AVX512DQ-NEXT:    movzbl %r13b, %r13d
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %rbp
-; AVX512DQ-NEXT:    shlq $18, %rbp
-; AVX512DQ-NEXT:    orq %r14, %rbp
-; AVX512DQ-NEXT:    movq %r13, %r14
-; AVX512DQ-NEXT:    shlq $19, %r14
-; AVX512DQ-NEXT:    orq %rbp, %r14
-; AVX512DQ-NEXT:    movq %r13, %rbp
-; AVX512DQ-NEXT:    shlq $20, %rbp
-; AVX512DQ-NEXT:    orq %r14, %rbp
-; AVX512DQ-NEXT:    movq %r13, %r14
-; AVX512DQ-NEXT:    shlq $21, %r14
-; AVX512DQ-NEXT:    orq %rbp, %r14
-; AVX512DQ-NEXT:    movq %r13, %rbp
-; AVX512DQ-NEXT:    shlq $22, %rbp
-; AVX512DQ-NEXT:    orq %r14, %rbp
-; AVX512DQ-NEXT:    kmovw %k4, %r14d
-; AVX512DQ-NEXT:    kshiftrw $10, %k3, %k4
-; AVX512DQ-NEXT:    shlq $23, %r13
-; AVX512DQ-NEXT:    orq %rbp, %r13
-; AVX512DQ-NEXT:    movzbl %r14b, %r14d
-; AVX512DQ-NEXT:    andl $1, %r14d
-; AVX512DQ-NEXT:    movq %r14, %rbp
-; AVX512DQ-NEXT:    shlq $24, %rbp
-; AVX512DQ-NEXT:    orq %r13, %rbp
-; AVX512DQ-NEXT:    movq %r14, %r13
-; AVX512DQ-NEXT:    shlq $25, %r13
-; AVX512DQ-NEXT:    orq %rbp, %r13
-; AVX512DQ-NEXT:    movq %r14, %rbp
-; AVX512DQ-NEXT:    shlq $26, %rbp
-; AVX512DQ-NEXT:    orq %r13, %rbp
-; AVX512DQ-NEXT:    movq %r14, %r13
-; AVX512DQ-NEXT:    shlq $27, %r13
-; AVX512DQ-NEXT:    orq %rbp, %r13
-; AVX512DQ-NEXT:    movq %r14, %rbp
-; AVX512DQ-NEXT:    shlq $28, %rbp
-; AVX512DQ-NEXT:    orq %r13, %rbp
-; AVX512DQ-NEXT:    shlq $29, %r14
-; AVX512DQ-NEXT:    orq %rbp, %r14
-; AVX512DQ-NEXT:    movzbl %bl, %r13d
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $30, %rax
-; AVX512DQ-NEXT:    orq %r14, %rax
-; AVX512DQ-NEXT:    kmovw %k4, %ebp
-; AVX512DQ-NEXT:    kshiftrw $6, %k3, %k4
-; AVX512DQ-NEXT:    shlq $31, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    andl $1, %ebx
-; AVX512DQ-NEXT:    movq %rbx, %rax
-; AVX512DQ-NEXT:    shlq $32, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %rbx, %r14
-; AVX512DQ-NEXT:    shlq $33, %r14
-; AVX512DQ-NEXT:    orq %rax, %r14
-; AVX512DQ-NEXT:    movq %rbx, %rax
-; AVX512DQ-NEXT:    shlq $34, %rax
-; AVX512DQ-NEXT:    orq %r14, %rax
-; AVX512DQ-NEXT:    kmovw %k4, %r14d
-; AVX512DQ-NEXT:    kshiftrw $7, %k3, %k4
-; AVX512DQ-NEXT:    shlq $35, %rbx
-; AVX512DQ-NEXT:    orq %rax, %rbx
-; AVX512DQ-NEXT:    andl $1, %r14d
-; AVX512DQ-NEXT:    movq %r14, %rax
-; AVX512DQ-NEXT:    shlq $36, %rax
-; AVX512DQ-NEXT:    orq %rbx, %rax
-; AVX512DQ-NEXT:    movq %r14, %rbx
-; AVX512DQ-NEXT:    shlq $37, %rbx
-; AVX512DQ-NEXT:    orq %rax, %rbx
-; AVX512DQ-NEXT:    movq %r14, %rax
-; AVX512DQ-NEXT:    shlq $38, %rax
-; AVX512DQ-NEXT:    orq %rbx, %rax
-; AVX512DQ-NEXT:    movq %r14, %rbx
-; AVX512DQ-NEXT:    shlq $39, %rbx
-; AVX512DQ-NEXT:    orq %rax, %rbx
-; AVX512DQ-NEXT:    movq %r14, %rax
-; AVX512DQ-NEXT:    shlq $40, %rax
-; AVX512DQ-NEXT:    orq %rbx, %rax
-; AVX512DQ-NEXT:    kmovw %k4, %ebx
-; AVX512DQ-NEXT:    kshiftrw $8, %k3, %k4
-; AVX512DQ-NEXT:    shlq $41, %r14
-; AVX512DQ-NEXT:    orq %rax, %r14
-; AVX512DQ-NEXT:    andl $1, %ebx
-; AVX512DQ-NEXT:    movq %rbx, %rax
-; AVX512DQ-NEXT:    shlq $42, %rax
-; AVX512DQ-NEXT:    orq %r14, %rax
-; AVX512DQ-NEXT:    movq %rbx, %r14
-; AVX512DQ-NEXT:    shlq $43, %r14
-; AVX512DQ-NEXT:    orq %rax, %r14
-; AVX512DQ-NEXT:    movq %rbx, %rax
-; AVX512DQ-NEXT:    shlq $44, %rax
-; AVX512DQ-NEXT:    orq %r14, %rax
-; AVX512DQ-NEXT:    movq %rbx, %r14
-; AVX512DQ-NEXT:    shlq $45, %r14
-; AVX512DQ-NEXT:    orq %rax, %r14
-; AVX512DQ-NEXT:    movq %rbx, %rax
-; AVX512DQ-NEXT:    shlq $46, %rax
-; AVX512DQ-NEXT:    orq %r14, %rax
-; AVX512DQ-NEXT:    kmovw %k4, %r14d
-; AVX512DQ-NEXT:    kshiftrw $9, %k3, %k3
-; AVX512DQ-NEXT:    shlq $47, %rbx
-; AVX512DQ-NEXT:    orq %rax, %rbx
-; AVX512DQ-NEXT:    andl $1, %r14d
-; AVX512DQ-NEXT:    movq %r14, %rax
-; AVX512DQ-NEXT:    shlq $48, %rax
-; AVX512DQ-NEXT:    orq %rbx, %rax
-; AVX512DQ-NEXT:    movq %r14, %rbx
-; AVX512DQ-NEXT:    shlq $49, %rbx
-; AVX512DQ-NEXT:    orq %rax, %rbx
-; AVX512DQ-NEXT:    movq %r14, %rax
-; AVX512DQ-NEXT:    shlq $50, %rax
-; AVX512DQ-NEXT:    orq %rbx, %rax
-; AVX512DQ-NEXT:    movq %r14, %rbx
-; AVX512DQ-NEXT:    shlq $51, %rbx
-; AVX512DQ-NEXT:    orq %rax, %rbx
-; AVX512DQ-NEXT:    movq %r14, %rax
-; AVX512DQ-NEXT:    shlq $52, %rax
-; AVX512DQ-NEXT:    orq %rbx, %rax
-; AVX512DQ-NEXT:    kmovw %k3, %ebx
-; AVX512DQ-NEXT:    kshiftrw $10, %k1, %k3
-; AVX512DQ-NEXT:    shlq $53, %r14
-; AVX512DQ-NEXT:    orq %rax, %r14
-; AVX512DQ-NEXT:    andl $1, %ebx
-; AVX512DQ-NEXT:    movq %rbx, %rax
-; AVX512DQ-NEXT:    shlq $54, %rax
-; AVX512DQ-NEXT:    orq %r14, %rax
-; AVX512DQ-NEXT:    movq %rbx, %r14
-; AVX512DQ-NEXT:    shlq $55, %r14
-; AVX512DQ-NEXT:    orq %rax, %r14
-; AVX512DQ-NEXT:    movq %rbx, %rax
-; AVX512DQ-NEXT:    shlq $56, %rax
-; AVX512DQ-NEXT:    orq %r14, %rax
-; AVX512DQ-NEXT:    movq %rbx, %r14
-; AVX512DQ-NEXT:    shlq $57, %r14
-; AVX512DQ-NEXT:    orq %rax, %r14
-; AVX512DQ-NEXT:    movq %rbx, %rax
-; AVX512DQ-NEXT:    shlq $58, %rax
-; AVX512DQ-NEXT:    orq %r14, %rax
-; AVX512DQ-NEXT:    shlq $59, %rbx
-; AVX512DQ-NEXT:    orq %rax, %rbx
-; AVX512DQ-NEXT:    movl %ebp, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %r14
-; AVX512DQ-NEXT:    shlq $60, %r14
-; AVX512DQ-NEXT:    orq %rbx, %r14
-; AVX512DQ-NEXT:    movq %rax, %r13
-; AVX512DQ-NEXT:    shlq $61, %r13
-; AVX512DQ-NEXT:    orq %r14, %r13
-; AVX512DQ-NEXT:    kmovw %k3, %r14d
-; AVX512DQ-NEXT:    kshiftrw $12, %k2, %k3
-; AVX512DQ-NEXT:    movzbl %bpl, %ebx
-; AVX512DQ-NEXT:    shlq $62, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %rbx, %r13
-; AVX512DQ-NEXT:    shlq $63, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    kmovw %k3, %eax
-; AVX512DQ-NEXT:    kshiftrw $11, %k2, %k3
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    kmovw %k3, %ebp
-; AVX512DQ-NEXT:    kshiftrw $13, %k2, %k3
-; AVX512DQ-NEXT:    movq %r13, (%rsi)
-; AVX512DQ-NEXT:    movzbl %bpl, %r13d
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    leaq (%r15,%r15,2), %r15
-; AVX512DQ-NEXT:    leaq (%r15,%r13,4), %r15
-; AVX512DQ-NEXT:    leaq (%r15,%r13,8), %r15
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $4, %r12
-; AVX512DQ-NEXT:    orq %r15, %r12
-; AVX512DQ-NEXT:    movq %r13, %r15
-; AVX512DQ-NEXT:    shlq $5, %r15
-; AVX512DQ-NEXT:    orq %r12, %r15
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $6, %r12
-; AVX512DQ-NEXT:    shlq $7, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    movzbl %al, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $8, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $9, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $10, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $11, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $12, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    kmovw %k3, %ebp
-; AVX512DQ-NEXT:    kshiftrw $14, %k2, %k3
-; AVX512DQ-NEXT:    shlq $13, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movzbl %bpl, %r13d
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $14, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $15, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $16, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $17, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $18, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    kmovw %k3, %ebp
-; AVX512DQ-NEXT:    kshiftrw $15, %k2, %k2
-; AVX512DQ-NEXT:    shlq $19, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movzbl %bpl, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $20, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $21, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $22, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $23, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $24, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    kmovw %k2, %ebp
-; AVX512DQ-NEXT:    kshiftrw $1, %k1, %k2
-; AVX512DQ-NEXT:    shlq $25, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movzbl %bpl, %r13d
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $26, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $27, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $28, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $29, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $30, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    shlq $31, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    kmovw %k1, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $32, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $33, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $34, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $35, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $36, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    kmovw %k2, %r13d
-; AVX512DQ-NEXT:    kshiftrw $2, %k1, %k2
-; AVX512DQ-NEXT:    shlq $37, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $38, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $39, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $40, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $41, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $42, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    kmovw %k2, %r12d
-; AVX512DQ-NEXT:    kshiftrw $3, %k1, %k2
-; AVX512DQ-NEXT:    shlq $43, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $44, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $45, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $46, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $47, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $48, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    kmovw %k2, %r13d
-; AVX512DQ-NEXT:    kshiftrw $4, %k1, %k2
-; AVX512DQ-NEXT:    shlq $49, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $50, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $51, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $52, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $53, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $54, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    kmovw %k2, %r12d
-; AVX512DQ-NEXT:    kshiftrw $5, %k1, %k2
-; AVX512DQ-NEXT:    shlq $55, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $56, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $57, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $58, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $59, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $60, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    kmovw %k2, %ebp
-; AVX512DQ-NEXT:    kshiftrw $6, %k1, %k2
-; AVX512DQ-NEXT:    shlq $61, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movzbl %bpl, %eax
-; AVX512DQ-NEXT:    movl %ebp, %r13d
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    shlq $62, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    movq %rax, %r12
-; AVX512DQ-NEXT:    shlq $63, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    kmovw %k2, %ebp
-; AVX512DQ-NEXT:    kshiftrw $7, %k1, %k2
-; AVX512DQ-NEXT:    orq %r15, %r12
-; AVX512DQ-NEXT:    movq %r12, 32(%rsi)
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    leaq (%rax,%rax,2), %r15
-; AVX512DQ-NEXT:    leaq (%r15,%rax,4), %r15
-; AVX512DQ-NEXT:    leaq (%r15,%rax,8), %rax
-; AVX512DQ-NEXT:    movzbl %bpl, %r13d
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $4, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r13, %r15
-; AVX512DQ-NEXT:    shlq $5, %r15
-; AVX512DQ-NEXT:    orq %r12, %r15
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $6, %rax
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $7, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $8, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    kmovw %k2, %ebp
-; AVX512DQ-NEXT:    kshiftrw $8, %k1, %k2
-; AVX512DQ-NEXT:    shlq $9, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movzbl %bpl, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $10, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $11, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $12, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $13, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $14, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    kmovw %k2, %ebp
-; AVX512DQ-NEXT:    kshiftrw $9, %k1, %k2
-; AVX512DQ-NEXT:    shlq $15, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movzbl %bpl, %r13d
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $16, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $17, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $18, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $19, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $20, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    kmovw %k2, %ebp
-; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k2
-; AVX512DQ-NEXT:    shlq $21, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movzbl %bpl, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $22, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $23, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $24, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $25, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $26, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    shlq $27, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movzbl %r14b, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %r13
-; AVX512DQ-NEXT:    shlq $28, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    movq %rax, %r12
-; AVX512DQ-NEXT:    shlq $29, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    movq %rax, %r13
-; AVX512DQ-NEXT:    shlq $30, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    kmovw %k2, %ebp
-; AVX512DQ-NEXT:    kshiftrw $11, %k1, %k2
-; AVX512DQ-NEXT:    shlq $31, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    andl $1, %r14d
-; AVX512DQ-NEXT:    movq %r14, %r13
-; AVX512DQ-NEXT:    shlq $32, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    kmovw %k2, %r12d
-; AVX512DQ-NEXT:    kshiftrw $12, %k1, %k2
-; AVX512DQ-NEXT:    shlq $33, %r14
-; AVX512DQ-NEXT:    orq %r13, %r14
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $34, %rax
-; AVX512DQ-NEXT:    orq %r14, %rax
-; AVX512DQ-NEXT:    movq %r12, %r14
-; AVX512DQ-NEXT:    shlq $35, %r14
-; AVX512DQ-NEXT:    orq %rax, %r14
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $36, %rax
-; AVX512DQ-NEXT:    orq %r14, %rax
-; AVX512DQ-NEXT:    movq %r12, %r14
-; AVX512DQ-NEXT:    shlq $37, %r14
-; AVX512DQ-NEXT:    orq %rax, %r14
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $38, %rax
-; AVX512DQ-NEXT:    orq %r14, %rax
-; AVX512DQ-NEXT:    kmovw %k2, %r14d
-; AVX512DQ-NEXT:    kshiftrw $13, %k1, %k2
-; AVX512DQ-NEXT:    shlq $39, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    andl $1, %r14d
-; AVX512DQ-NEXT:    movq %r14, %rax
-; AVX512DQ-NEXT:    shlq $40, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r14, %r12
-; AVX512DQ-NEXT:    shlq $41, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r14, %rax
-; AVX512DQ-NEXT:    shlq $42, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r14, %r12
-; AVX512DQ-NEXT:    shlq $43, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r14, %rax
-; AVX512DQ-NEXT:    shlq $44, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    kmovw %k2, %r12d
-; AVX512DQ-NEXT:    kshiftrw $14, %k1, %k1
-; AVX512DQ-NEXT:    shlq $45, %r14
-; AVX512DQ-NEXT:    orq %rax, %r14
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $46, %rax
-; AVX512DQ-NEXT:    orq %r14, %rax
-; AVX512DQ-NEXT:    movq %r12, %r14
-; AVX512DQ-NEXT:    shlq $47, %r14
-; AVX512DQ-NEXT:    orq %rax, %r14
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $48, %rax
-; AVX512DQ-NEXT:    orq %r14, %rax
-; AVX512DQ-NEXT:    movq %r12, %r14
-; AVX512DQ-NEXT:    shlq $49, %r14
-; AVX512DQ-NEXT:    orq %rax, %r14
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $50, %rax
-; AVX512DQ-NEXT:    orq %r14, %rax
-; AVX512DQ-NEXT:    kmovw %k1, %r14d
-; AVX512DQ-NEXT:    kshiftrw $5, %k0, %k1
-; AVX512DQ-NEXT:    shlq $51, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    andl $1, %r14d
-; AVX512DQ-NEXT:    movq %r14, %rax
-; AVX512DQ-NEXT:    shlq $52, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r14, %r12
-; AVX512DQ-NEXT:    shlq $53, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r14, %rax
-; AVX512DQ-NEXT:    shlq $54, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r14, %r12
-; AVX512DQ-NEXT:    shlq $55, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r14, %rax
-; AVX512DQ-NEXT:    shlq $56, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    shlq $57, %r14
-; AVX512DQ-NEXT:    orq %rax, %r14
-; AVX512DQ-NEXT:    movzbl %bpl, %eax
-; AVX512DQ-NEXT:    movl %ebp, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $58, %r13
-; AVX512DQ-NEXT:    orq %r14, %r13
-; AVX512DQ-NEXT:    movq %r12, %r14
-; AVX512DQ-NEXT:    shlq $59, %r14
-; AVX512DQ-NEXT:    orq %r13, %r14
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $60, %r13
-; AVX512DQ-NEXT:    orq %r14, %r13
-; AVX512DQ-NEXT:    movq %r12, %rbp
-; AVX512DQ-NEXT:    shlq $61, %rbp
-; AVX512DQ-NEXT:    orq %r13, %rbp
-; AVX512DQ-NEXT:    kmovw %k1, %r14d
-; AVX512DQ-NEXT:    kshiftrw $7, %k0, %k1
-; AVX512DQ-NEXT:    shlq $62, %r12
-; AVX512DQ-NEXT:    orq %rbp, %r12
-; AVX512DQ-NEXT:    shlq $63, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movq %rax, 40(%rsi)
-; AVX512DQ-NEXT:    movzbl %r14b, %r15d
-; AVX512DQ-NEXT:    movl %r15d, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    leaq (%rax,%rax,2), %r12
-; AVX512DQ-NEXT:    leaq (%r12,%rax,4), %r12
-; AVX512DQ-NEXT:    leaq (%r12,%rax,8), %rax
-; AVX512DQ-NEXT:    movzbl %r11b, %r13d
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $4, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r13, %r11
-; AVX512DQ-NEXT:    shlq $5, %r11
-; AVX512DQ-NEXT:    orq %r12, %r11
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $6, %rax
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $7, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $8, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    kmovw %k1, %ebp
-; AVX512DQ-NEXT:    kshiftrw $8, %k0, %k1
-; AVX512DQ-NEXT:    shlq $9, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movzbl %bpl, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $10, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $11, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $12, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $13, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $14, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    kmovw %k1, %ebp
-; AVX512DQ-NEXT:    kshiftrw $9, %k0, %k1
-; AVX512DQ-NEXT:    shlq $15, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movzbl %bpl, %r13d
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $16, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $17, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $18, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $19, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $20, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    kmovw %k1, %ebp
-; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k1
-; AVX512DQ-NEXT:    shlq $21, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movzbl %bpl, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $22, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $23, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $24, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $25, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $26, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    shlq $27, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movzbl %r10b, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %r13
-; AVX512DQ-NEXT:    shlq $28, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    movq %rax, %r12
-; AVX512DQ-NEXT:    shlq $29, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    movq %rax, %r13
-; AVX512DQ-NEXT:    shlq $30, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    kmovw %k1, %ebp
-; AVX512DQ-NEXT:    kshiftrw $11, %k0, %k1
-; AVX512DQ-NEXT:    shlq $31, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    andl $1, %r10d
-; AVX512DQ-NEXT:    movq %r10, %r13
-; AVX512DQ-NEXT:    shlq $32, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    kmovw %k1, %r12d
-; AVX512DQ-NEXT:    kshiftrw $12, %k0, %k1
-; AVX512DQ-NEXT:    shlq $33, %r10
-; AVX512DQ-NEXT:    orq %r13, %r10
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $34, %rax
-; AVX512DQ-NEXT:    orq %r10, %rax
-; AVX512DQ-NEXT:    movq %r12, %r10
-; AVX512DQ-NEXT:    shlq $35, %r10
-; AVX512DQ-NEXT:    orq %rax, %r10
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $36, %rax
-; AVX512DQ-NEXT:    orq %r10, %rax
-; AVX512DQ-NEXT:    movq %r12, %r10
-; AVX512DQ-NEXT:    shlq $37, %r10
-; AVX512DQ-NEXT:    orq %rax, %r10
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $38, %rax
-; AVX512DQ-NEXT:    orq %r10, %rax
-; AVX512DQ-NEXT:    kmovw %k1, %r10d
-; AVX512DQ-NEXT:    kshiftrw $13, %k0, %k1
-; AVX512DQ-NEXT:    shlq $39, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    andl $1, %r10d
-; AVX512DQ-NEXT:    movq %r10, %rax
-; AVX512DQ-NEXT:    shlq $40, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r10, %r12
-; AVX512DQ-NEXT:    shlq $41, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r10, %rax
-; AVX512DQ-NEXT:    shlq $42, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r10, %r12
-; AVX512DQ-NEXT:    shlq $43, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r10, %rax
-; AVX512DQ-NEXT:    shlq $44, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    kmovw %k1, %r12d
-; AVX512DQ-NEXT:    kshiftrw $14, %k0, %k1
-; AVX512DQ-NEXT:    shlq $45, %r10
-; AVX512DQ-NEXT:    orq %rax, %r10
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $46, %rax
-; AVX512DQ-NEXT:    orq %r10, %rax
-; AVX512DQ-NEXT:    movq %r12, %r10
-; AVX512DQ-NEXT:    shlq $47, %r10
-; AVX512DQ-NEXT:    orq %rax, %r10
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $48, %rax
-; AVX512DQ-NEXT:    orq %r10, %rax
-; AVX512DQ-NEXT:    movq %r12, %r10
-; AVX512DQ-NEXT:    shlq $49, %r10
-; AVX512DQ-NEXT:    orq %rax, %r10
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $50, %rax
-; AVX512DQ-NEXT:    orq %r10, %rax
-; AVX512DQ-NEXT:    kmovw %k1, %r13d
-; AVX512DQ-NEXT:    kshiftrw $1, %k0, %k1
-; AVX512DQ-NEXT:    shlq $51, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $52, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %r10
-; AVX512DQ-NEXT:    shlq $53, %r10
-; AVX512DQ-NEXT:    orq %rax, %r10
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $54, %rax
-; AVX512DQ-NEXT:    orq %r10, %rax
-; AVX512DQ-NEXT:    movq %r13, %r10
-; AVX512DQ-NEXT:    shlq $55, %r10
-; AVX512DQ-NEXT:    orq %rax, %r10
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $56, %rax
-; AVX512DQ-NEXT:    orq %r10, %rax
-; AVX512DQ-NEXT:    shlq $57, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movl %ebp, %r10d
-; AVX512DQ-NEXT:    andl $1, %r10d
-; AVX512DQ-NEXT:    movq %r10, %rax
-; AVX512DQ-NEXT:    shlq $58, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r10, %r12
-; AVX512DQ-NEXT:    shlq $59, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r10, %rax
-; AVX512DQ-NEXT:    shlq $60, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r10, %r12
-; AVX512DQ-NEXT:    shlq $61, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    kmovw %k1, %r13d
-; AVX512DQ-NEXT:    kshiftrw $2, %k0, %k1
-; AVX512DQ-NEXT:    shlq $62, %r10
-; AVX512DQ-NEXT:    orq %r12, %r10
-; AVX512DQ-NEXT:    kmovw %k1, %r12d
-; AVX512DQ-NEXT:    movzbl %bpl, %ebp
-; AVX512DQ-NEXT:    shlq $63, %rbp
-; AVX512DQ-NEXT:    orq %r10, %rbp
-; AVX512DQ-NEXT:    kmovw %k0, %r10d
-; AVX512DQ-NEXT:    kshiftrw $4, %k0, %k0
-; AVX512DQ-NEXT:    orq %r11, %rbp
-; AVX512DQ-NEXT:    kmovw %k0, %r11d
-; AVX512DQ-NEXT:    movq %rbp, 16(%rsi)
-; AVX512DQ-NEXT:    movzbl %r9b, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    andl $1, %ebx
-; AVX512DQ-NEXT:    leaq (%rbx,%rbx,2), %r9
-; AVX512DQ-NEXT:    leaq (%r9,%rax,4), %r9
-; AVX512DQ-NEXT:    leaq (%r9,%rax,8), %r9
-; AVX512DQ-NEXT:    movq %rax, %rbx
-; AVX512DQ-NEXT:    shlq $4, %rbx
-; AVX512DQ-NEXT:    orq %r9, %rbx
-; AVX512DQ-NEXT:    movq %rax, %r9
-; AVX512DQ-NEXT:    shlq $5, %r9
-; AVX512DQ-NEXT:    orq %rbx, %r9
-; AVX512DQ-NEXT:    movq %rax, %rbx
-; AVX512DQ-NEXT:    shlq $6, %rbx
-; AVX512DQ-NEXT:    shlq $7, %rax
-; AVX512DQ-NEXT:    orq %rbx, %rax
-; AVX512DQ-NEXT:    movzbl %r8b, %r8d
-; AVX512DQ-NEXT:    andl $1, %r8d
-; AVX512DQ-NEXT:    movq %r8, %rbx
-; AVX512DQ-NEXT:    shlq $8, %rbx
-; AVX512DQ-NEXT:    orq %rax, %rbx
-; AVX512DQ-NEXT:    movq %r8, %rax
-; AVX512DQ-NEXT:    shlq $9, %rax
-; AVX512DQ-NEXT:    orq %rbx, %rax
-; AVX512DQ-NEXT:    movq %r8, %rbx
-; AVX512DQ-NEXT:    shlq $10, %rbx
-; AVX512DQ-NEXT:    orq %rax, %rbx
-; AVX512DQ-NEXT:    movq %r8, %rax
-; AVX512DQ-NEXT:    shlq $11, %rax
-; AVX512DQ-NEXT:    orq %rbx, %rax
-; AVX512DQ-NEXT:    movq %r8, %rbx
-; AVX512DQ-NEXT:    shlq $12, %rbx
-; AVX512DQ-NEXT:    orq %rax, %rbx
-; AVX512DQ-NEXT:    shlq $13, %r8
-; AVX512DQ-NEXT:    orq %rbx, %r8
-; AVX512DQ-NEXT:    movzbl %dil, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %rdi
-; AVX512DQ-NEXT:    shlq $14, %rdi
-; AVX512DQ-NEXT:    orq %r8, %rdi
-; AVX512DQ-NEXT:    movq %rax, %r8
-; AVX512DQ-NEXT:    shlq $15, %r8
-; AVX512DQ-NEXT:    orq %rdi, %r8
-; AVX512DQ-NEXT:    movq %rax, %rdi
-; AVX512DQ-NEXT:    shlq $16, %rdi
-; AVX512DQ-NEXT:    orq %r8, %rdi
-; AVX512DQ-NEXT:    movq %rax, %r8
-; AVX512DQ-NEXT:    shlq $17, %r8
-; AVX512DQ-NEXT:    orq %rdi, %r8
-; AVX512DQ-NEXT:    movq %rax, %rdi
-; AVX512DQ-NEXT:    shlq $18, %rdi
-; AVX512DQ-NEXT:    orq %r8, %rdi
-; AVX512DQ-NEXT:    shlq $19, %rax
-; AVX512DQ-NEXT:    orq %rdi, %rax
-; AVX512DQ-NEXT:    movzbl %dl, %edx
-; AVX512DQ-NEXT:    andl $1, %edx
-; AVX512DQ-NEXT:    movq %rdx, %rdi
-; AVX512DQ-NEXT:    shlq $20, %rdi
-; AVX512DQ-NEXT:    orq %rax, %rdi
-; AVX512DQ-NEXT:    movq %rdx, %rax
-; AVX512DQ-NEXT:    shlq $21, %rax
-; AVX512DQ-NEXT:    orq %rdi, %rax
-; AVX512DQ-NEXT:    movq %rdx, %rdi
-; AVX512DQ-NEXT:    shlq $22, %rdi
-; AVX512DQ-NEXT:    orq %rax, %rdi
-; AVX512DQ-NEXT:    movq %rdx, %rax
-; AVX512DQ-NEXT:    shlq $23, %rax
-; AVX512DQ-NEXT:    orq %rdi, %rax
-; AVX512DQ-NEXT:    movq %rdx, %rdi
-; AVX512DQ-NEXT:    shlq $24, %rdi
-; AVX512DQ-NEXT:    orq %rax, %rdi
-; AVX512DQ-NEXT:    shlq $25, %rdx
-; AVX512DQ-NEXT:    orq %rdi, %rdx
-; AVX512DQ-NEXT:    movzbl %cl, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %rcx
-; AVX512DQ-NEXT:    shlq $26, %rcx
-; AVX512DQ-NEXT:    orq %rdx, %rcx
-; AVX512DQ-NEXT:    movq %rax, %rdx
-; AVX512DQ-NEXT:    shlq $27, %rdx
-; AVX512DQ-NEXT:    orq %rcx, %rdx
-; AVX512DQ-NEXT:    movq %rax, %rcx
-; AVX512DQ-NEXT:    shlq $28, %rcx
-; AVX512DQ-NEXT:    orq %rdx, %rcx
-; AVX512DQ-NEXT:    movq %rax, %rdx
-; AVX512DQ-NEXT:    shlq $29, %rdx
-; AVX512DQ-NEXT:    orq %rcx, %rdx
-; AVX512DQ-NEXT:    movq %rax, %rcx
-; AVX512DQ-NEXT:    shlq $30, %rcx
-; AVX512DQ-NEXT:    orq %rdx, %rcx
-; AVX512DQ-NEXT:    shlq $31, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    andl $1, %r10d
-; AVX512DQ-NEXT:    movq %r10, %rcx
-; AVX512DQ-NEXT:    shlq $32, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    movq %r10, %rax
-; AVX512DQ-NEXT:    shlq $33, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    movq %r10, %rcx
-; AVX512DQ-NEXT:    shlq $34, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    movq %r10, %rax
-; AVX512DQ-NEXT:    shlq $35, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    movq %r10, %rcx
-; AVX512DQ-NEXT:    shlq $36, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    shlq $37, %r10
-; AVX512DQ-NEXT:    orq %rcx, %r10
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $38, %rax
-; AVX512DQ-NEXT:    orq %r10, %rax
-; AVX512DQ-NEXT:    movq %r13, %rcx
-; AVX512DQ-NEXT:    shlq $39, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $40, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    movq %r13, %rcx
-; AVX512DQ-NEXT:    shlq $41, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $42, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    shlq $43, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $44, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %rcx
-; AVX512DQ-NEXT:    shlq $45, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $46, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    movq %r12, %rcx
-; AVX512DQ-NEXT:    shlq $47, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $48, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    shlq $49, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %rcx
-; AVX512DQ-NEXT:    shlq $50, %rcx
-; AVX512DQ-NEXT:    orq %r12, %rcx
-; AVX512DQ-NEXT:    movq %rax, %rdx
-; AVX512DQ-NEXT:    shlq $51, %rdx
-; AVX512DQ-NEXT:    orq %rcx, %rdx
-; AVX512DQ-NEXT:    movq %rax, %rcx
-; AVX512DQ-NEXT:    shlq $52, %rcx
-; AVX512DQ-NEXT:    orq %rdx, %rcx
-; AVX512DQ-NEXT:    movq %rax, %rdx
-; AVX512DQ-NEXT:    shlq $53, %rdx
-; AVX512DQ-NEXT:    orq %rcx, %rdx
-; AVX512DQ-NEXT:    movq %rax, %rcx
-; AVX512DQ-NEXT:    shlq $54, %rcx
-; AVX512DQ-NEXT:    orq %rdx, %rcx
-; AVX512DQ-NEXT:    shlq $55, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    andl $1, %r11d
-; AVX512DQ-NEXT:    movq %r11, %rcx
-; AVX512DQ-NEXT:    shlq $56, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    movq %r11, %rax
-; AVX512DQ-NEXT:    shlq $57, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    movq %r11, %rcx
-; AVX512DQ-NEXT:    shlq $58, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    movq %r11, %rax
-; AVX512DQ-NEXT:    shlq $59, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    movq %r11, %rcx
-; AVX512DQ-NEXT:    shlq $60, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    shlq $61, %r11
-; AVX512DQ-NEXT:    orq %rcx, %r11
-; AVX512DQ-NEXT:    andl $1, %r14d
-; AVX512DQ-NEXT:    shlq $62, %r14
-; AVX512DQ-NEXT:    orq %r11, %r14
-; AVX512DQ-NEXT:    shlq $63, %r15
-; AVX512DQ-NEXT:    orq %r14, %r15
-; AVX512DQ-NEXT:    orq %r9, %r15
-; AVX512DQ-NEXT:    movq %r15, 8(%rsi)
-; AVX512DQ-NEXT:    popq %rbx
-; AVX512DQ-NEXT:    popq %r12
-; AVX512DQ-NEXT:    popq %r13
-; AVX512DQ-NEXT:    popq %r14
-; AVX512DQ-NEXT:    popq %r15
-; AVX512DQ-NEXT:    popq %rbp
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm5
+; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k1
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15]
+; AVX512DQ-NEXT:    vpermd %zmm3, %zmm2, %zmm6
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13]
+; AVX512DQ-NEXT:    vpermd %zmm3, %zmm7, %zmm8
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10]
+; AVX512DQ-NEXT:    vpermd %zmm3, %zmm9, %zmm10
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7]
+; AVX512DQ-NEXT:    vpermd %zmm3, %zmm11, %zmm12
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm13 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5]
+; AVX512DQ-NEXT:    vpermd %zmm3, %zmm13, %zmm14
+; AVX512DQ-NEXT:    vpermd %zmm3, %zmm1, %zmm3
+; AVX512DQ-NEXT:    vpermd %zmm4, %zmm2, %zmm15
+; AVX512DQ-NEXT:    vpermd %zmm4, %zmm7, %zmm16
+; AVX512DQ-NEXT:    vpermd %zmm4, %zmm9, %zmm17
+; AVX512DQ-NEXT:    vpermd %zmm4, %zmm11, %zmm18
+; AVX512DQ-NEXT:    vpermd %zmm4, %zmm13, %zmm19
+; AVX512DQ-NEXT:    vpermd %zmm4, %zmm1, %zmm4
+; AVX512DQ-NEXT:    vpermd %zmm5, %zmm2, %zmm20
+; AVX512DQ-NEXT:    vpermd %zmm5, %zmm7, %zmm21
+; AVX512DQ-NEXT:    vpermd %zmm5, %zmm9, %zmm22
+; AVX512DQ-NEXT:    vpermd %zmm5, %zmm11, %zmm23
+; AVX512DQ-NEXT:    vpermd %zmm5, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpermd %zmm5, %zmm13, %zmm5
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm2, %zmm2
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm7, %zmm7
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm9, %zmm9
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm11, %zmm11
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm13, %zmm0
+; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm13 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k1
+; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm11, %k1
+; AVX512DQ-NEXT:    vmovdqa32 128(%rsi), %zmm11 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm9, %k1
+; AVX512DQ-NEXT:    vmovdqa32 192(%rsi), %zmm9 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm7, %k1
+; AVX512DQ-NEXT:    vmovdqa32 256(%rsi), %zmm7 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k1
+; AVX512DQ-NEXT:    vmovdqa32 320(%rsi), %zmm2 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k1
+; AVX512DQ-NEXT:    vmovdqa32 384(%rsi), %zmm1 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm5, %k1
+; AVX512DQ-NEXT:    vmovdqa32 448(%rsi), %zmm5 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm23, %k1
+; AVX512DQ-NEXT:    vmovdqa32 512(%rsi), %zmm23 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm22, %k1
+; AVX512DQ-NEXT:    vmovdqa32 576(%rsi), %zmm22 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm21, %k1
+; AVX512DQ-NEXT:    vmovdqa32 640(%rsi), %zmm21 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm20, %k1
+; AVX512DQ-NEXT:    vmovdqa32 704(%rsi), %zmm20 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm4, %k1
+; AVX512DQ-NEXT:    vmovdqa32 768(%rsi), %zmm4 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm19, %k1
+; AVX512DQ-NEXT:    vmovdqa32 832(%rsi), %zmm19 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm18, %k1
+; AVX512DQ-NEXT:    vmovdqa32 896(%rsi), %zmm18 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm17, %k1
+; AVX512DQ-NEXT:    vmovdqa32 960(%rsi), %zmm17 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm16, %k1
+; AVX512DQ-NEXT:    vmovdqa32 1024(%rsi), %zmm16 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm15, %k1
+; AVX512DQ-NEXT:    vmovdqa32 1088(%rsi), %zmm15 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm3, %k1
+; AVX512DQ-NEXT:    vmovdqa32 1152(%rsi), %zmm3 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm14, %k1
+; AVX512DQ-NEXT:    vmovdqa32 1216(%rsi), %zmm14 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm12, %k1
+; AVX512DQ-NEXT:    vmovdqa32 1280(%rsi), %zmm12 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm10, %k1
+; AVX512DQ-NEXT:    vmovdqa32 1344(%rsi), %zmm10 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm8, %k1
+; AVX512DQ-NEXT:    vmovdqa32 1408(%rsi), %zmm8 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm6, %k1
+; AVX512DQ-NEXT:    vmovdqa32 1472(%rsi), %zmm6 {%k1} {z}
+; AVX512DQ-NEXT:    vmovdqa64 %zmm6, 1472(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm8, 1408(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm10, 1344(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm12, 1280(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm14, 1216(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 1152(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm15, 1088(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm16, 1024(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm17, 960(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm18, 896(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm19, 832(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 768(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm20, 704(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm21, 640(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm22, 576(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm23, 512(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm5, 448(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 384(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 320(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm7, 256(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm9, 192(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm11, 128(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm13, (%rdx)
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512BW-LABEL: mask_replication_factor6_vf64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    pushq %rbp
-; AVX512BW-NEXT:    pushq %r15
-; AVX512BW-NEXT:    pushq %r14
-; AVX512BW-NEXT:    pushq %r13
-; AVX512BW-NEXT:    pushq %r12
-; AVX512BW-NEXT:    pushq %rbx
-; AVX512BW-NEXT:    kmovq (%rdi), %k0
-; AVX512BW-NEXT:    kshiftrq $15, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512BW-NEXT:    kshiftrq $14, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %ecx
-; AVX512BW-NEXT:    kshiftrq $13, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %edx
-; AVX512BW-NEXT:    kshiftrq $12, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %edi
-; AVX512BW-NEXT:    kshiftrq $26, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r11d
-; AVX512BW-NEXT:    kshiftrq $22, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %ebx
-; AVX512BW-NEXT:    kshiftrq $58, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r14d
-; AVX512BW-NEXT:    kshiftrq $44, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %ebp
-; AVX512BW-NEXT:    kshiftrq $42, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r8d
-; AVX512BW-NEXT:    kshiftrq $5, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r9d
-; AVX512BW-NEXT:    kshiftrq $1, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r15d
-; AVX512BW-NEXT:    kshiftrq $2, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k0, %r10d
-; AVX512BW-NEXT:    movzbl %r10b, %r10d
-; AVX512BW-NEXT:    andl $1, %r10d
-; AVX512BW-NEXT:    leaq (%r10,%r10,2), %r12
-; AVX512BW-NEXT:    leaq (%r12,%r10,4), %r12
-; AVX512BW-NEXT:    leaq (%r12,%r10,8), %r12
-; AVX512BW-NEXT:    movq %r10, %r13
-; AVX512BW-NEXT:    shlq $4, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    shlq $5, %r10
-; AVX512BW-NEXT:    orq %r13, %r10
-; AVX512BW-NEXT:    movzbl %r15b, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %r15
-; AVX512BW-NEXT:    shlq $6, %r15
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $7, %r13
-; AVX512BW-NEXT:    orq %r15, %r13
-; AVX512BW-NEXT:    movq %r12, %r15
-; AVX512BW-NEXT:    shlq $8, %r15
-; AVX512BW-NEXT:    orq %r13, %r15
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $9, %r13
-; AVX512BW-NEXT:    orq %r15, %r13
-; AVX512BW-NEXT:    movq %r12, %r15
-; AVX512BW-NEXT:    shlq $10, %r15
-; AVX512BW-NEXT:    orq %r13, %r15
-; AVX512BW-NEXT:    kmovd %k1, %r13d
-; AVX512BW-NEXT:    kshiftrq $3, %k0, %k1
-; AVX512BW-NEXT:    shlq $11, %r12
-; AVX512BW-NEXT:    orq %r15, %r12
-; AVX512BW-NEXT:    movzbl %r13b, %r15d
-; AVX512BW-NEXT:    andl $1, %r15d
-; AVX512BW-NEXT:    movq %r15, %r13
-; AVX512BW-NEXT:    shlq $12, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    movq %r15, %r12
-; AVX512BW-NEXT:    shlq $13, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    movq %r15, %r13
-; AVX512BW-NEXT:    shlq $14, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    movq %r15, %r12
-; AVX512BW-NEXT:    shlq $15, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    movq %r15, %r13
-; AVX512BW-NEXT:    shlq $16, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrq $4, %k0, %k1
-; AVX512BW-NEXT:    shlq $17, %r15
-; AVX512BW-NEXT:    orq %r13, %r15
-; AVX512BW-NEXT:    movzbl %r12b, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $18, %r13
-; AVX512BW-NEXT:    orq %r15, %r13
-; AVX512BW-NEXT:    movq %r12, %r15
-; AVX512BW-NEXT:    shlq $19, %r15
-; AVX512BW-NEXT:    orq %r13, %r15
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $20, %r13
-; AVX512BW-NEXT:    orq %r15, %r13
-; AVX512BW-NEXT:    movq %r12, %r15
-; AVX512BW-NEXT:    shlq $21, %r15
-; AVX512BW-NEXT:    orq %r13, %r15
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $22, %r13
-; AVX512BW-NEXT:    orq %r15, %r13
-; AVX512BW-NEXT:    kmovd %k1, %r15d
-; AVX512BW-NEXT:    kshiftrq $10, %k0, %k1
-; AVX512BW-NEXT:    shlq $23, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    movzbl %r15b, %r15d
-; AVX512BW-NEXT:    andl $1, %r15d
-; AVX512BW-NEXT:    movq %r15, %r13
-; AVX512BW-NEXT:    shlq $24, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    movq %r15, %r12
-; AVX512BW-NEXT:    shlq $25, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    movq %r15, %r13
-; AVX512BW-NEXT:    shlq $26, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    movq %r15, %r12
-; AVX512BW-NEXT:    shlq $27, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    movq %r15, %r13
-; AVX512BW-NEXT:    shlq $28, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    shlq $29, %r15
-; AVX512BW-NEXT:    orq %r13, %r15
-; AVX512BW-NEXT:    movzbl %r9b, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $30, %r13
-; AVX512BW-NEXT:    orq %r15, %r13
-; AVX512BW-NEXT:    kmovd %k1, %r15d
-; AVX512BW-NEXT:    kshiftrq $6, %k0, %k1
-; AVX512BW-NEXT:    shlq $31, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    andl $1, %r9d
-; AVX512BW-NEXT:    movq %r9, %r13
-; AVX512BW-NEXT:    shlq $32, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    movq %r9, %r12
-; AVX512BW-NEXT:    shlq $33, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    movq %r9, %r13
-; AVX512BW-NEXT:    shlq $34, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrq $7, %k0, %k1
-; AVX512BW-NEXT:    shlq $35, %r9
-; AVX512BW-NEXT:    orq %r13, %r9
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $36, %r13
-; AVX512BW-NEXT:    orq %r9, %r13
-; AVX512BW-NEXT:    movq %r12, %r9
-; AVX512BW-NEXT:    shlq $37, %r9
-; AVX512BW-NEXT:    orq %r13, %r9
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $38, %r13
-; AVX512BW-NEXT:    orq %r9, %r13
-; AVX512BW-NEXT:    movq %r12, %r9
-; AVX512BW-NEXT:    shlq $39, %r9
-; AVX512BW-NEXT:    orq %r13, %r9
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $40, %r13
-; AVX512BW-NEXT:    orq %r9, %r13
-; AVX512BW-NEXT:    kmovd %k1, %r9d
-; AVX512BW-NEXT:    kshiftrq $8, %k0, %k1
-; AVX512BW-NEXT:    shlq $41, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    andl $1, %r9d
-; AVX512BW-NEXT:    movq %r9, %r13
-; AVX512BW-NEXT:    shlq $42, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    movq %r9, %r12
-; AVX512BW-NEXT:    shlq $43, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    movq %r9, %r13
-; AVX512BW-NEXT:    shlq $44, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    movq %r9, %r12
-; AVX512BW-NEXT:    shlq $45, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    movq %r9, %r13
-; AVX512BW-NEXT:    shlq $46, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrq $9, %k0, %k1
-; AVX512BW-NEXT:    shlq $47, %r9
-; AVX512BW-NEXT:    orq %r13, %r9
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $48, %r13
-; AVX512BW-NEXT:    orq %r9, %r13
-; AVX512BW-NEXT:    movq %r12, %r9
-; AVX512BW-NEXT:    shlq $49, %r9
-; AVX512BW-NEXT:    orq %r13, %r9
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $50, %r13
-; AVX512BW-NEXT:    orq %r9, %r13
-; AVX512BW-NEXT:    movq %r12, %r9
-; AVX512BW-NEXT:    shlq $51, %r9
-; AVX512BW-NEXT:    orq %r13, %r9
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $52, %r13
-; AVX512BW-NEXT:    orq %r9, %r13
-; AVX512BW-NEXT:    kmovd %k1, %r9d
-; AVX512BW-NEXT:    kshiftrq $43, %k0, %k1
-; AVX512BW-NEXT:    shlq $53, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    andl $1, %r9d
-; AVX512BW-NEXT:    movq %r9, %r13
-; AVX512BW-NEXT:    shlq $54, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    movq %r9, %r12
-; AVX512BW-NEXT:    shlq $55, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    movq %r9, %r13
-; AVX512BW-NEXT:    shlq $56, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    movq %r9, %r12
-; AVX512BW-NEXT:    shlq $57, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    movq %r9, %r13
-; AVX512BW-NEXT:    shlq $58, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    shlq $59, %r9
-; AVX512BW-NEXT:    orq %r13, %r9
-; AVX512BW-NEXT:    movl %r15d, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $60, %r13
-; AVX512BW-NEXT:    orq %r9, %r13
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $61, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    kmovd %k1, %r13d
-; AVX512BW-NEXT:    kshiftrq $45, %k0, %k1
-; AVX512BW-NEXT:    movzbl %r15b, %r9d
-; AVX512BW-NEXT:    shlq $62, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r9, %rax
-; AVX512BW-NEXT:    shlq $63, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    orq %r10, %rax
-; AVX512BW-NEXT:    movq %rax, (%rsi)
-; AVX512BW-NEXT:    movzbl %r13b, %r13d
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movzbl %r8b, %r10d
-; AVX512BW-NEXT:    movl %r10d, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    leaq (%rax,%rax,2), %rax
-; AVX512BW-NEXT:    leaq (%rax,%r13,4), %rax
-; AVX512BW-NEXT:    leaq (%rax,%r13,8), %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $4, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r13, %r15
-; AVX512BW-NEXT:    shlq $5, %r15
-; AVX512BW-NEXT:    orq %r12, %r15
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $6, %rax
-; AVX512BW-NEXT:    shlq $7, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movzbl %bpl, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $8, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $9, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $10, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $11, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $12, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    kmovd %k1, %ebp
-; AVX512BW-NEXT:    kshiftrq $46, %k0, %k1
-; AVX512BW-NEXT:    shlq $13, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movzbl %bpl, %r13d
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $14, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $15, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $16, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $17, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $18, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    kmovd %k1, %ebp
-; AVX512BW-NEXT:    kshiftrq $47, %k0, %k1
-; AVX512BW-NEXT:    shlq $19, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movzbl %bpl, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $20, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $21, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $22, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $23, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $24, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    kmovd %k1, %ebp
-; AVX512BW-NEXT:    kshiftrq $48, %k0, %k1
-; AVX512BW-NEXT:    shlq $25, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movzbl %bpl, %r13d
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $26, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $27, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $28, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $29, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $30, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrq $49, %k0, %k1
-; AVX512BW-NEXT:    shlq $31, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $32, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $33, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $34, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $35, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $36, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    kmovd %k1, %r13d
-; AVX512BW-NEXT:    kshiftrq $50, %k0, %k1
-; AVX512BW-NEXT:    shlq $37, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $38, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $39, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $40, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $41, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $42, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrq $51, %k0, %k1
-; AVX512BW-NEXT:    shlq $43, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $44, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $45, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $46, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $47, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $48, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    kmovd %k1, %r13d
-; AVX512BW-NEXT:    kshiftrq $52, %k0, %k1
-; AVX512BW-NEXT:    shlq $49, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $50, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $51, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $52, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $53, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $54, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrq $53, %k0, %k1
-; AVX512BW-NEXT:    shlq $55, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $56, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $57, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $58, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $59, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $60, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    kmovd %k1, %ebp
-; AVX512BW-NEXT:    kshiftrq $54, %k0, %k1
-; AVX512BW-NEXT:    shlq $61, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movzbl %bpl, %eax
-; AVX512BW-NEXT:    movl %ebp, %r13d
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    shlq $62, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    movq %rax, %r12
-; AVX512BW-NEXT:    shlq $63, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    kmovd %k1, %ebp
-; AVX512BW-NEXT:    kshiftrq $55, %k0, %k1
-; AVX512BW-NEXT:    orq %r15, %r12
-; AVX512BW-NEXT:    movq %r12, 32(%rsi)
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    leaq (%rax,%rax,2), %r15
-; AVX512BW-NEXT:    leaq (%r15,%rax,4), %r15
-; AVX512BW-NEXT:    leaq (%r15,%rax,8), %rax
-; AVX512BW-NEXT:    movzbl %bpl, %r13d
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $4, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r13, %r15
-; AVX512BW-NEXT:    shlq $5, %r15
-; AVX512BW-NEXT:    orq %r12, %r15
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $6, %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $7, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $8, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    kmovd %k1, %ebp
-; AVX512BW-NEXT:    kshiftrq $56, %k0, %k1
-; AVX512BW-NEXT:    shlq $9, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movzbl %bpl, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $10, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $11, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $12, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $13, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $14, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    kmovd %k1, %ebp
-; AVX512BW-NEXT:    kshiftrq $57, %k0, %k1
-; AVX512BW-NEXT:    shlq $15, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movzbl %bpl, %r13d
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $16, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $17, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $18, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $19, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $20, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    kmovd %k1, %ebp
-; AVX512BW-NEXT:    kshiftrq $63, %k0, %k1
-; AVX512BW-NEXT:    shlq $21, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movzbl %bpl, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $22, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $23, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $24, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $25, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $26, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    shlq $27, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movzbl %r14b, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %r13
-; AVX512BW-NEXT:    shlq $28, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    movq %rax, %r12
-; AVX512BW-NEXT:    shlq $29, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    movq %rax, %r13
-; AVX512BW-NEXT:    shlq $30, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    kmovd %k1, %ebp
-; AVX512BW-NEXT:    kshiftrq $59, %k0, %k1
-; AVX512BW-NEXT:    shlq $31, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    andl $1, %r14d
-; AVX512BW-NEXT:    movq %r14, %r13
-; AVX512BW-NEXT:    shlq $32, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrq $60, %k0, %k1
-; AVX512BW-NEXT:    shlq $33, %r14
-; AVX512BW-NEXT:    orq %r13, %r14
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $34, %rax
-; AVX512BW-NEXT:    orq %r14, %rax
-; AVX512BW-NEXT:    movq %r12, %r14
-; AVX512BW-NEXT:    shlq $35, %r14
-; AVX512BW-NEXT:    orq %rax, %r14
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $36, %rax
-; AVX512BW-NEXT:    orq %r14, %rax
-; AVX512BW-NEXT:    movq %r12, %r14
-; AVX512BW-NEXT:    shlq $37, %r14
-; AVX512BW-NEXT:    orq %rax, %r14
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $38, %rax
-; AVX512BW-NEXT:    orq %r14, %rax
-; AVX512BW-NEXT:    kmovd %k1, %r14d
-; AVX512BW-NEXT:    kshiftrq $61, %k0, %k1
-; AVX512BW-NEXT:    shlq $39, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    andl $1, %r14d
-; AVX512BW-NEXT:    movq %r14, %rax
-; AVX512BW-NEXT:    shlq $40, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r14, %r12
-; AVX512BW-NEXT:    shlq $41, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r14, %rax
-; AVX512BW-NEXT:    shlq $42, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r14, %r12
-; AVX512BW-NEXT:    shlq $43, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r14, %rax
-; AVX512BW-NEXT:    shlq $44, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrq $62, %k0, %k1
-; AVX512BW-NEXT:    shlq $45, %r14
-; AVX512BW-NEXT:    orq %rax, %r14
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $46, %rax
-; AVX512BW-NEXT:    orq %r14, %rax
-; AVX512BW-NEXT:    movq %r12, %r14
-; AVX512BW-NEXT:    shlq $47, %r14
-; AVX512BW-NEXT:    orq %rax, %r14
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $48, %rax
-; AVX512BW-NEXT:    orq %r14, %rax
-; AVX512BW-NEXT:    movq %r12, %r14
-; AVX512BW-NEXT:    shlq $49, %r14
-; AVX512BW-NEXT:    orq %rax, %r14
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $50, %rax
-; AVX512BW-NEXT:    orq %r14, %rax
-; AVX512BW-NEXT:    kmovd %k1, %r14d
-; AVX512BW-NEXT:    kshiftrq $21, %k0, %k1
-; AVX512BW-NEXT:    shlq $51, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    andl $1, %r14d
-; AVX512BW-NEXT:    movq %r14, %rax
-; AVX512BW-NEXT:    shlq $52, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r14, %r12
-; AVX512BW-NEXT:    shlq $53, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r14, %rax
-; AVX512BW-NEXT:    shlq $54, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r14, %r12
-; AVX512BW-NEXT:    shlq $55, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r14, %rax
-; AVX512BW-NEXT:    shlq $56, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    shlq $57, %r14
-; AVX512BW-NEXT:    orq %rax, %r14
-; AVX512BW-NEXT:    movl %ebp, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %r12
-; AVX512BW-NEXT:    shlq $58, %r12
-; AVX512BW-NEXT:    orq %r14, %r12
-; AVX512BW-NEXT:    movq %rax, %r14
-; AVX512BW-NEXT:    shlq $59, %r14
-; AVX512BW-NEXT:    orq %r12, %r14
-; AVX512BW-NEXT:    movq %rax, %r12
-; AVX512BW-NEXT:    shlq $60, %r12
-; AVX512BW-NEXT:    orq %r14, %r12
-; AVX512BW-NEXT:    movq %rax, %r13
-; AVX512BW-NEXT:    shlq $61, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    kmovd %k1, %r14d
-; AVX512BW-NEXT:    kshiftrq $23, %k0, %k1
-; AVX512BW-NEXT:    movzbl %bpl, %r12d
-; AVX512BW-NEXT:    shlq $62, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    shlq $63, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    orq %r15, %r12
-; AVX512BW-NEXT:    movq %r12, 40(%rsi)
-; AVX512BW-NEXT:    movzbl %r14b, %r15d
-; AVX512BW-NEXT:    movl %r15d, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    leaq (%rax,%rax,2), %r12
-; AVX512BW-NEXT:    leaq (%r12,%rax,4), %r12
-; AVX512BW-NEXT:    leaq (%r12,%rax,8), %rax
-; AVX512BW-NEXT:    movzbl %bl, %r13d
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $4, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r13, %rbx
-; AVX512BW-NEXT:    shlq $5, %rbx
-; AVX512BW-NEXT:    orq %r12, %rbx
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $6, %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $7, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $8, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    kmovd %k1, %ebp
-; AVX512BW-NEXT:    kshiftrq $24, %k0, %k1
-; AVX512BW-NEXT:    shlq $9, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movzbl %bpl, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $10, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $11, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $12, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $13, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $14, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    kmovd %k1, %ebp
-; AVX512BW-NEXT:    kshiftrq $25, %k0, %k1
-; AVX512BW-NEXT:    shlq $15, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movzbl %bpl, %r13d
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $16, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $17, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $18, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $19, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $20, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    kmovd %k1, %ebp
-; AVX512BW-NEXT:    kshiftrq $31, %k0, %k1
-; AVX512BW-NEXT:    shlq $21, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movzbl %bpl, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $22, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $23, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $24, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $25, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $26, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    shlq $27, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movzbl %r11b, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %r13
-; AVX512BW-NEXT:    shlq $28, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    movq %rax, %r12
-; AVX512BW-NEXT:    shlq $29, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    movq %rax, %r13
-; AVX512BW-NEXT:    shlq $30, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    kmovd %k1, %ebp
-; AVX512BW-NEXT:    kshiftrq $27, %k0, %k1
-; AVX512BW-NEXT:    shlq $31, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    andl $1, %r11d
-; AVX512BW-NEXT:    movq %r11, %r13
-; AVX512BW-NEXT:    shlq $32, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrq $28, %k0, %k1
-; AVX512BW-NEXT:    shlq $33, %r11
-; AVX512BW-NEXT:    orq %r13, %r11
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $34, %rax
-; AVX512BW-NEXT:    orq %r11, %rax
-; AVX512BW-NEXT:    movq %r12, %r11
-; AVX512BW-NEXT:    shlq $35, %r11
-; AVX512BW-NEXT:    orq %rax, %r11
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $36, %rax
-; AVX512BW-NEXT:    orq %r11, %rax
-; AVX512BW-NEXT:    movq %r12, %r11
-; AVX512BW-NEXT:    shlq $37, %r11
-; AVX512BW-NEXT:    orq %rax, %r11
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $38, %rax
-; AVX512BW-NEXT:    orq %r11, %rax
-; AVX512BW-NEXT:    kmovd %k1, %r11d
-; AVX512BW-NEXT:    kshiftrq $29, %k0, %k1
-; AVX512BW-NEXT:    shlq $39, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    andl $1, %r11d
-; AVX512BW-NEXT:    movq %r11, %rax
-; AVX512BW-NEXT:    shlq $40, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r11, %r12
-; AVX512BW-NEXT:    shlq $41, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r11, %rax
-; AVX512BW-NEXT:    shlq $42, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r11, %r12
-; AVX512BW-NEXT:    shlq $43, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r11, %rax
-; AVX512BW-NEXT:    shlq $44, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrq $30, %k0, %k1
-; AVX512BW-NEXT:    shlq $45, %r11
-; AVX512BW-NEXT:    orq %rax, %r11
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $46, %rax
-; AVX512BW-NEXT:    orq %r11, %rax
-; AVX512BW-NEXT:    movq %r12, %r11
-; AVX512BW-NEXT:    shlq $47, %r11
-; AVX512BW-NEXT:    orq %rax, %r11
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $48, %rax
-; AVX512BW-NEXT:    orq %r11, %rax
-; AVX512BW-NEXT:    movq %r12, %r11
-; AVX512BW-NEXT:    shlq $49, %r11
-; AVX512BW-NEXT:    orq %rax, %r11
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $50, %rax
-; AVX512BW-NEXT:    orq %r11, %rax
-; AVX512BW-NEXT:    kmovd %k1, %r11d
-; AVX512BW-NEXT:    kshiftrq $11, %k0, %k1
-; AVX512BW-NEXT:    shlq $51, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    andl $1, %r11d
-; AVX512BW-NEXT:    movq %r11, %rax
-; AVX512BW-NEXT:    shlq $52, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r11, %r12
-; AVX512BW-NEXT:    shlq $53, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r11, %rax
-; AVX512BW-NEXT:    shlq $54, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r11, %r12
-; AVX512BW-NEXT:    shlq $55, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r11, %rax
-; AVX512BW-NEXT:    shlq $56, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    shlq $57, %r11
-; AVX512BW-NEXT:    orq %rax, %r11
-; AVX512BW-NEXT:    movl %ebp, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %r12
-; AVX512BW-NEXT:    shlq $58, %r12
-; AVX512BW-NEXT:    orq %r11, %r12
-; AVX512BW-NEXT:    movq %rax, %r11
-; AVX512BW-NEXT:    shlq $59, %r11
-; AVX512BW-NEXT:    orq %r12, %r11
-; AVX512BW-NEXT:    movq %rax, %r12
-; AVX512BW-NEXT:    shlq $60, %r12
-; AVX512BW-NEXT:    orq %r11, %r12
-; AVX512BW-NEXT:    movq %rax, %r13
-; AVX512BW-NEXT:    shlq $61, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    kmovd %k1, %r11d
-; AVX512BW-NEXT:    kshiftrq $33, %k0, %k1
-; AVX512BW-NEXT:    shlq $62, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrq $32, %k0, %k1
-; AVX512BW-NEXT:    movzbl %bpl, %r13d
-; AVX512BW-NEXT:    shlq $63, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    kshiftrq $34, %k0, %k1
-; AVX512BW-NEXT:    orq %rbx, %r13
-; AVX512BW-NEXT:    movq %r13, 16(%rsi)
-; AVX512BW-NEXT:    movzbl %al, %ebx
-; AVX512BW-NEXT:    andl $1, %ebx
-; AVX512BW-NEXT:    leaq (%rbx,%rbx,2), %rax
-; AVX512BW-NEXT:    leaq (%rax,%rbx,4), %rax
-; AVX512BW-NEXT:    leaq (%rax,%rbx,8), %rax
-; AVX512BW-NEXT:    movq %rbx, %r13
-; AVX512BW-NEXT:    shlq $4, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    shlq $5, %rbx
-; AVX512BW-NEXT:    orq %r13, %rbx
-; AVX512BW-NEXT:    movzbl %r12b, %r13d
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $6, %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $7, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $8, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $9, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $10, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    kmovd %k1, %ebp
-; AVX512BW-NEXT:    kshiftrq $35, %k0, %k1
-; AVX512BW-NEXT:    shlq $11, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movzbl %bpl, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $12, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $13, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $14, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $15, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $16, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    kmovd %k1, %ebp
-; AVX512BW-NEXT:    kshiftrq $36, %k0, %k1
-; AVX512BW-NEXT:    shlq $17, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movzbl %bpl, %r13d
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $18, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $19, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $20, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $21, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $22, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    kmovd %k1, %ebp
-; AVX512BW-NEXT:    kshiftrq $37, %k0, %k1
-; AVX512BW-NEXT:    shlq $23, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movzbl %bpl, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $24, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $25, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $26, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $27, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $28, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    kmovd %k1, %ebp
-; AVX512BW-NEXT:    kshiftrq $38, %k0, %k1
-; AVX512BW-NEXT:    shlq $29, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movzbl %bpl, %r13d
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $30, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $31, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $32, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $33, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $34, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    kmovd %k1, %ebp
-; AVX512BW-NEXT:    kshiftrq $39, %k0, %k1
-; AVX512BW-NEXT:    shlq $35, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movzbl %bpl, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $36, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $37, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $38, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $39, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $40, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    kmovd %k1, %r13d
-; AVX512BW-NEXT:    kshiftrq $40, %k0, %k1
-; AVX512BW-NEXT:    shlq $41, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $42, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $43, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $44, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $45, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $46, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    kmovd %k1, %ebp
-; AVX512BW-NEXT:    kshiftrq $41, %k0, %k1
-; AVX512BW-NEXT:    shlq $47, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    andl $1, %ebp
-; AVX512BW-NEXT:    movq %rbp, %rax
-; AVX512BW-NEXT:    shlq $48, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %rbp, %r12
-; AVX512BW-NEXT:    shlq $49, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %rbp, %rax
-; AVX512BW-NEXT:    shlq $50, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %rbp, %r12
-; AVX512BW-NEXT:    shlq $51, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %rbp, %rax
-; AVX512BW-NEXT:    shlq $52, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrq $16, %k0, %k1
-; AVX512BW-NEXT:    shlq $53, %rbp
-; AVX512BW-NEXT:    orq %rax, %rbp
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $54, %rax
-; AVX512BW-NEXT:    orq %rbp, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $55, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $56, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %rbp
-; AVX512BW-NEXT:    shlq $57, %rbp
-; AVX512BW-NEXT:    orq %rax, %rbp
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $58, %r13
-; AVX512BW-NEXT:    orq %rbp, %r13
-; AVX512BW-NEXT:    kmovd %k1, %ebp
-; AVX512BW-NEXT:    kshiftrq $17, %k0, %k1
-; AVX512BW-NEXT:    shlq $59, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    andl $1, %r8d
-; AVX512BW-NEXT:    movq %r8, %rax
-; AVX512BW-NEXT:    shlq $60, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r8, %r13
-; AVX512BW-NEXT:    shlq $61, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrq $18, %k0, %k1
-; AVX512BW-NEXT:    shlq $62, %r8
-; AVX512BW-NEXT:    orq %r13, %r8
-; AVX512BW-NEXT:    kmovd %k1, %r13d
-; AVX512BW-NEXT:    kshiftrq $19, %k0, %k1
-; AVX512BW-NEXT:    shlq $63, %r10
-; AVX512BW-NEXT:    orq %r8, %r10
-; AVX512BW-NEXT:    kmovd %k1, %r8d
-; AVX512BW-NEXT:    kshiftrq $20, %k0, %k0
-; AVX512BW-NEXT:    orq %rbx, %r10
-; AVX512BW-NEXT:    kmovd %k0, %ebx
-; AVX512BW-NEXT:    movq %r10, 24(%rsi)
-; AVX512BW-NEXT:    movzbl %r11b, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    andl $1, %r9d
-; AVX512BW-NEXT:    leaq (%r9,%r9,2), %r9
-; AVX512BW-NEXT:    leaq (%r9,%rax,4), %r9
-; AVX512BW-NEXT:    leaq (%r9,%rax,8), %r9
-; AVX512BW-NEXT:    movq %rax, %r10
-; AVX512BW-NEXT:    shlq $4, %r10
-; AVX512BW-NEXT:    orq %r9, %r10
-; AVX512BW-NEXT:    movq %rax, %r9
-; AVX512BW-NEXT:    shlq $5, %r9
-; AVX512BW-NEXT:    orq %r10, %r9
-; AVX512BW-NEXT:    movq %rax, %r10
-; AVX512BW-NEXT:    shlq $6, %r10
-; AVX512BW-NEXT:    shlq $7, %rax
-; AVX512BW-NEXT:    orq %r10, %rax
-; AVX512BW-NEXT:    movzbl %dil, %edi
-; AVX512BW-NEXT:    andl $1, %edi
-; AVX512BW-NEXT:    movq %rdi, %r10
-; AVX512BW-NEXT:    shlq $8, %r10
-; AVX512BW-NEXT:    orq %rax, %r10
-; AVX512BW-NEXT:    movq %rdi, %rax
-; AVX512BW-NEXT:    shlq $9, %rax
-; AVX512BW-NEXT:    orq %r10, %rax
-; AVX512BW-NEXT:    movq %rdi, %r10
-; AVX512BW-NEXT:    shlq $10, %r10
-; AVX512BW-NEXT:    orq %rax, %r10
-; AVX512BW-NEXT:    movq %rdi, %rax
-; AVX512BW-NEXT:    shlq $11, %rax
-; AVX512BW-NEXT:    orq %r10, %rax
-; AVX512BW-NEXT:    movq %rdi, %r10
-; AVX512BW-NEXT:    shlq $12, %r10
-; AVX512BW-NEXT:    orq %rax, %r10
-; AVX512BW-NEXT:    shlq $13, %rdi
-; AVX512BW-NEXT:    orq %r10, %rdi
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %rdx
-; AVX512BW-NEXT:    shlq $14, %rdx
-; AVX512BW-NEXT:    orq %rdi, %rdx
-; AVX512BW-NEXT:    movq %rax, %rdi
-; AVX512BW-NEXT:    shlq $15, %rdi
-; AVX512BW-NEXT:    orq %rdx, %rdi
-; AVX512BW-NEXT:    movq %rax, %rdx
-; AVX512BW-NEXT:    shlq $16, %rdx
-; AVX512BW-NEXT:    orq %rdi, %rdx
-; AVX512BW-NEXT:    movq %rax, %rdi
-; AVX512BW-NEXT:    shlq $17, %rdi
-; AVX512BW-NEXT:    orq %rdx, %rdi
-; AVX512BW-NEXT:    movq %rax, %rdx
-; AVX512BW-NEXT:    shlq $18, %rdx
-; AVX512BW-NEXT:    orq %rdi, %rdx
-; AVX512BW-NEXT:    shlq $19, %rax
-; AVX512BW-NEXT:    orq %rdx, %rax
-; AVX512BW-NEXT:    movzbl %cl, %ecx
-; AVX512BW-NEXT:    andl $1, %ecx
-; AVX512BW-NEXT:    movq %rcx, %rdx
-; AVX512BW-NEXT:    shlq $20, %rdx
-; AVX512BW-NEXT:    orq %rax, %rdx
-; AVX512BW-NEXT:    movq %rcx, %rax
-; AVX512BW-NEXT:    shlq $21, %rax
-; AVX512BW-NEXT:    orq %rdx, %rax
-; AVX512BW-NEXT:    movq %rcx, %rdx
-; AVX512BW-NEXT:    shlq $22, %rdx
-; AVX512BW-NEXT:    orq %rax, %rdx
-; AVX512BW-NEXT:    movq %rcx, %rax
-; AVX512BW-NEXT:    shlq $23, %rax
-; AVX512BW-NEXT:    orq %rdx, %rax
-; AVX512BW-NEXT:    movq %rcx, %rdx
-; AVX512BW-NEXT:    shlq $24, %rdx
-; AVX512BW-NEXT:    orq %rax, %rdx
-; AVX512BW-NEXT:    shlq $25, %rcx
-; AVX512BW-NEXT:    orq %rdx, %rcx
-; AVX512BW-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %rdx
-; AVX512BW-NEXT:    shlq $26, %rdx
-; AVX512BW-NEXT:    orq %rcx, %rdx
-; AVX512BW-NEXT:    movq %rax, %rcx
-; AVX512BW-NEXT:    shlq $27, %rcx
-; AVX512BW-NEXT:    orq %rdx, %rcx
-; AVX512BW-NEXT:    movq %rax, %rdx
-; AVX512BW-NEXT:    shlq $28, %rdx
-; AVX512BW-NEXT:    orq %rcx, %rdx
-; AVX512BW-NEXT:    movq %rax, %rcx
-; AVX512BW-NEXT:    shlq $29, %rcx
-; AVX512BW-NEXT:    orq %rdx, %rcx
-; AVX512BW-NEXT:    movq %rax, %rdx
-; AVX512BW-NEXT:    shlq $30, %rdx
-; AVX512BW-NEXT:    orq %rcx, %rdx
-; AVX512BW-NEXT:    shlq $31, %rax
-; AVX512BW-NEXT:    orq %rdx, %rax
-; AVX512BW-NEXT:    andl $1, %ebp
-; AVX512BW-NEXT:    movq %rbp, %rcx
-; AVX512BW-NEXT:    shlq $32, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    movq %rbp, %rax
-; AVX512BW-NEXT:    shlq $33, %rax
-; AVX512BW-NEXT:    orq %rcx, %rax
-; AVX512BW-NEXT:    movq %rbp, %rcx
-; AVX512BW-NEXT:    shlq $34, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    movq %rbp, %rax
-; AVX512BW-NEXT:    shlq $35, %rax
-; AVX512BW-NEXT:    orq %rcx, %rax
-; AVX512BW-NEXT:    movq %rbp, %rcx
-; AVX512BW-NEXT:    shlq $36, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    shlq $37, %rbp
-; AVX512BW-NEXT:    orq %rcx, %rbp
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $38, %rax
-; AVX512BW-NEXT:    orq %rbp, %rax
-; AVX512BW-NEXT:    movq %r12, %rcx
-; AVX512BW-NEXT:    shlq $39, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $40, %rax
-; AVX512BW-NEXT:    orq %rcx, %rax
-; AVX512BW-NEXT:    movq %r12, %rcx
-; AVX512BW-NEXT:    shlq $41, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $42, %rax
-; AVX512BW-NEXT:    orq %rcx, %rax
-; AVX512BW-NEXT:    shlq $43, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $44, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %rcx
-; AVX512BW-NEXT:    shlq $45, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $46, %rax
-; AVX512BW-NEXT:    orq %rcx, %rax
-; AVX512BW-NEXT:    movq %r13, %rcx
-; AVX512BW-NEXT:    shlq $47, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $48, %rax
-; AVX512BW-NEXT:    orq %rcx, %rax
-; AVX512BW-NEXT:    shlq $49, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    andl $1, %r8d
-; AVX512BW-NEXT:    movq %r8, %rax
-; AVX512BW-NEXT:    shlq $50, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r8, %rcx
-; AVX512BW-NEXT:    shlq $51, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    movq %r8, %rax
-; AVX512BW-NEXT:    shlq $52, %rax
-; AVX512BW-NEXT:    orq %rcx, %rax
-; AVX512BW-NEXT:    movq %r8, %rcx
-; AVX512BW-NEXT:    shlq $53, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    movq %r8, %rax
-; AVX512BW-NEXT:    shlq $54, %rax
-; AVX512BW-NEXT:    orq %rcx, %rax
-; AVX512BW-NEXT:    shlq $55, %r8
-; AVX512BW-NEXT:    orq %rax, %r8
-; AVX512BW-NEXT:    andl $1, %ebx
-; AVX512BW-NEXT:    movq %rbx, %rax
-; AVX512BW-NEXT:    shlq $56, %rax
-; AVX512BW-NEXT:    orq %r8, %rax
-; AVX512BW-NEXT:    movq %rbx, %rcx
-; AVX512BW-NEXT:    shlq $57, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    movq %rbx, %rax
-; AVX512BW-NEXT:    shlq $58, %rax
-; AVX512BW-NEXT:    orq %rcx, %rax
-; AVX512BW-NEXT:    movq %rbx, %rcx
-; AVX512BW-NEXT:    shlq $59, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    movq %rbx, %rax
-; AVX512BW-NEXT:    shlq $60, %rax
-; AVX512BW-NEXT:    orq %rcx, %rax
-; AVX512BW-NEXT:    shlq $61, %rbx
-; AVX512BW-NEXT:    orq %rax, %rbx
-; AVX512BW-NEXT:    andl $1, %r14d
-; AVX512BW-NEXT:    shlq $62, %r14
-; AVX512BW-NEXT:    orq %rbx, %r14
-; AVX512BW-NEXT:    shlq $63, %r15
-; AVX512BW-NEXT:    orq %r14, %r15
-; AVX512BW-NEXT:    orq %r9, %r15
-; AVX512BW-NEXT:    movq %r15, 8(%rsi)
-; AVX512BW-NEXT:    popq %rbx
-; AVX512BW-NEXT:    popq %r12
-; AVX512BW-NEXT:    popq %r13
-; AVX512BW-NEXT:    popq %r14
-; AVX512BW-NEXT:    popq %r15
-; AVX512BW-NEXT:    popq %rbp
+; AVX512BW-NEXT:    kmovq (%rdi), %k5
+; AVX512BW-NEXT:    movw $-3, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k1
+; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k1, %k5, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k5, %k1
+; AVX512BW-NEXT:    kshiftrw $14, %k1, %k3
+; AVX512BW-NEXT:    korw %k3, %k0, %k0
+; AVX512BW-NEXT:    movw $-5, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k2
+; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $13, %k1, %k3
+; AVX512BW-NEXT:    korw %k3, %k0, %k0
+; AVX512BW-NEXT:    movw $-9, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k2
+; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $12, %k1, %k3
+; AVX512BW-NEXT:    korw %k3, %k0, %k0
+; AVX512BW-NEXT:    movw $-17, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k2
+; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $11, %k1, %k3
+; AVX512BW-NEXT:    korw %k3, %k0, %k0
+; AVX512BW-NEXT:    movw $-33, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k2
+; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $10, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    movw $-65, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k1
+; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $1, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $9, %k1, %k3
+; AVX512BW-NEXT:    korw %k3, %k0, %k0
+; AVX512BW-NEXT:    movw $-129, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k2
+; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $8, %k1, %k3
+; AVX512BW-NEXT:    korw %k3, %k0, %k0
+; AVX512BW-NEXT:    movw $-257, %ax # imm = 0xFEFF
+; AVX512BW-NEXT:    kmovd %eax, %k2
+; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $7, %k1, %k3
+; AVX512BW-NEXT:    korw %k3, %k0, %k0
+; AVX512BW-NEXT:    movw $-513, %ax # imm = 0xFDFF
+; AVX512BW-NEXT:    kmovd %eax, %k2
+; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $6, %k1, %k3
+; AVX512BW-NEXT:    korw %k3, %k0, %k0
+; AVX512BW-NEXT:    movw $-1025, %ax # imm = 0xFBFF
+; AVX512BW-NEXT:    kmovd %eax, %k2
+; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $5, %k1, %k3
+; AVX512BW-NEXT:    korw %k3, %k0, %k0
+; AVX512BW-NEXT:    movw $-2049, %ax # imm = 0xF7FF
+; AVX512BW-NEXT:    kmovd %eax, %k2
+; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $4, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    movw $-4097, %ax # imm = 0xEFFF
+; AVX512BW-NEXT:    kmovd %eax, %k1
+; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k1, %k0, %k3
+; AVX512BW-NEXT:    kshiftrq $2, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k0
+; AVX512BW-NEXT:    kshiftrw $3, %k0, %k4
+; AVX512BW-NEXT:    korw %k4, %k3, %k3
+; AVX512BW-NEXT:    movw $-8193, %ax # imm = 0xDFFF
+; AVX512BW-NEXT:    kmovd %eax, %k2
+; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k2, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $2, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k3, %k7
+; AVX512BW-NEXT:    movw $-16385, %ax # imm = 0xBFFF
+; AVX512BW-NEXT:    kmovd %eax, %k2
+; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k2, %k7, %k7
+; AVX512BW-NEXT:    kshiftlw $14, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k7, %k6
+; AVX512BW-NEXT:    kshiftlw $1, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $1, %k6, %k6
+; AVX512BW-NEXT:    korw %k0, %k6, %k6
+; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k6} {z}
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $14, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $3, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $13, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $12, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $8, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $4, %k5, %k1
+; AVX512BW-NEXT:    kmovq %k5, %k7
+; AVX512BW-NEXT:    kmovq %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $4, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $3, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $2, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $5, %k7, %k1
+; AVX512BW-NEXT:    kshiftlw $14, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k7
+; AVX512BW-NEXT:    vmovdqa32 64(%rsi), %zmm1 {%k7} {z}
+; AVX512BW-NEXT:    kandw %k4, %k1, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $13, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $12, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
+; AVX512BW-NEXT:    kshiftrq $6, %k7, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $6, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k1
+; AVX512BW-NEXT:    kshiftrq $7, %k7, %k6
+; AVX512BW-NEXT:    kshiftlw $15, %k6, %k0
+; AVX512BW-NEXT:    kshiftrw $5, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $4, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $3, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $2, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $14, %k6, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
+; AVX512BW-NEXT:    korw %k0, %k1, %k1
+; AVX512BW-NEXT:    vmovdqa32 128(%rsi), %zmm2 {%k1} {z}
+; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
+; AVX512BW-NEXT:    kshiftrq $8, %k7, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $13, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $12, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $11, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $10, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $9, %k7, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $4, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $10, %k7, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
+; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $2, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $14, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512BW-NEXT:    korw %k6, %k0, %k7
+; AVX512BW-NEXT:    vmovdqa32 192(%rsi), %zmm3 {%k7} {z}
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k1, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
+; AVX512BW-NEXT:    kshiftrq $11, %k7, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $13, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $12, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kandw %k5, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $8, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $12, %k7, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $4, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $3, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $2, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $13, %k7, %k1
+; AVX512BW-NEXT:    kmovq %k7, %k2
+; AVX512BW-NEXT:    kshiftlw $14, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k7
+; AVX512BW-NEXT:    vmovdqa32 256(%rsi), %zmm4 {%k7} {z}
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k1, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $13, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $12, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovq %k2, %k7
+; AVX512BW-NEXT:    kshiftrq $14, %k2, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kandw %k5, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $6, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k1
+; AVX512BW-NEXT:    kshiftrq $15, %k7, %k6
+; AVX512BW-NEXT:    kshiftlw $15, %k6, %k0
+; AVX512BW-NEXT:    kshiftrw $5, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $4, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $3, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $2, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $14, %k6, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
+; AVX512BW-NEXT:    korw %k0, %k1, %k1
+; AVX512BW-NEXT:    vmovdqa32 320(%rsi), %zmm5 {%k1} {z}
+; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload
+; AVX512BW-NEXT:    kshiftrq $16, %k5, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $13, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $12, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $11, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $10, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $17, %k5, %k1
+; AVX512BW-NEXT:    kmovq %k5, %k7
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $4, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $18, %k7, %k1
+; AVX512BW-NEXT:    kmovq %k7, %k3
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
+; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $2, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $14, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512BW-NEXT:    korw %k6, %k0, %k7
+; AVX512BW-NEXT:    vmovdqa32 384(%rsi), %zmm6 {%k7} {z}
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k1, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovq %k3, %k7
+; AVX512BW-NEXT:    kshiftrq $19, %k3, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $13, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $12, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $8, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $20, %k7, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kandw %k5, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $4, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $3, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $2, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kmovq %k7, %k4
+; AVX512BW-NEXT:    kshiftrq $21, %k7, %k1
+; AVX512BW-NEXT:    kshiftlw $14, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k7
+; AVX512BW-NEXT:    vmovdqa32 448(%rsi), %zmm7 {%k7} {z}
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $13, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $12, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $22, %k4, %k1
+; AVX512BW-NEXT:    kmovq %k4, %k7
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $6, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k1
+; AVX512BW-NEXT:    kshiftrq $23, %k7, %k6
+; AVX512BW-NEXT:    kshiftlw $15, %k6, %k0
+; AVX512BW-NEXT:    kshiftrw $5, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $4, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $3, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $2, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $14, %k6, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
+; AVX512BW-NEXT:    korw %k0, %k1, %k1
+; AVX512BW-NEXT:    vmovdqa32 512(%rsi), %zmm8 {%k1} {z}
+; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload
+; AVX512BW-NEXT:    kshiftrq $24, %k5, %k0
+; AVX512BW-NEXT:    kandw %k2, %k0, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $13, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $12, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $11, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $10, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $25, %k5, %k1
+; AVX512BW-NEXT:    kmovq %k5, %k7
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $4, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $26, %k7, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
+; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $2, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $14, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512BW-NEXT:    korw %k6, %k0, %k7
+; AVX512BW-NEXT:    vmovdqa32 576(%rsi), %zmm9 {%k7} {z}
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k1, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
+; AVX512BW-NEXT:    kshiftrq $27, %k7, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $13, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $12, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $8, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $28, %k7, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kandw %k5, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $4, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $3, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $2, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $29, %k7, %k1
+; AVX512BW-NEXT:    kshiftlw $14, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k7
+; AVX512BW-NEXT:    vmovdqa32 640(%rsi), %zmm10 {%k7} {z}
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k1, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $13, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $12, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
+; AVX512BW-NEXT:    kshiftrq $30, %k7, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $6, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k1
+; AVX512BW-NEXT:    kshiftrq $31, %k7, %k6
+; AVX512BW-NEXT:    kshiftlw $15, %k6, %k0
+; AVX512BW-NEXT:    kshiftrw $5, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $4, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $3, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $2, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $14, %k6, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
+; AVX512BW-NEXT:    korw %k0, %k1, %k1
+; AVX512BW-NEXT:    vmovdqa32 704(%rsi), %zmm11 {%k1} {z}
+; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload
+; AVX512BW-NEXT:    kshiftrq $32, %k5, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $13, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $12, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $11, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $10, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $33, %k5, %k1
+; AVX512BW-NEXT:    kmovq %k5, %k7
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $4, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $34, %k7, %k1
+; AVX512BW-NEXT:    kmovq %k7, %k4
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
+; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $2, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $14, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512BW-NEXT:    korw %k6, %k0, %k7
+; AVX512BW-NEXT:    vmovdqa32 768(%rsi), %zmm12 {%k7} {z}
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k1, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovq %k4, %k7
+; AVX512BW-NEXT:    kshiftrq $35, %k4, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $13, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $12, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $8, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $36, %k7, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kandw %k5, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $4, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $3, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $2, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kmovq %k7, %k3
+; AVX512BW-NEXT:    kshiftrq $37, %k7, %k1
+; AVX512BW-NEXT:    kshiftlw $14, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k7
+; AVX512BW-NEXT:    vmovdqa32 832(%rsi), %zmm13 {%k7} {z}
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k1, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $13, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $12, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $38, %k3, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $6, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k1
+; AVX512BW-NEXT:    kshiftrq $39, %k3, %k6
+; AVX512BW-NEXT:    kshiftlw $15, %k6, %k0
+; AVX512BW-NEXT:    kshiftrw $5, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $4, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $3, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $2, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $14, %k6, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
+; AVX512BW-NEXT:    korw %k0, %k1, %k1
+; AVX512BW-NEXT:    vmovdqa32 896(%rsi), %zmm14 {%k1} {z}
+; AVX512BW-NEXT:    kmovq %k3, %k7
+; AVX512BW-NEXT:    kshiftrq $40, %k3, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $13, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $12, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $11, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $10, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $41, %k7, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $4, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $42, %k7, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
+; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $2, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $14, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512BW-NEXT:    korw %k6, %k0, %k7
+; AVX512BW-NEXT:    vmovdqa32 960(%rsi), %zmm15 {%k7} {z}
+; AVX512BW-NEXT:    kandw %k3, %k1, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
+; AVX512BW-NEXT:    kshiftrq $43, %k7, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $13, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $12, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $8, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $44, %k7, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kandw %k5, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $4, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $3, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $2, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $45, %k7, %k1
+; AVX512BW-NEXT:    kshiftlw $14, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k7
+; AVX512BW-NEXT:    vmovdqa32 1024(%rsi), %zmm16 {%k7} {z}
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $13, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $12, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
+; AVX512BW-NEXT:    kshiftrq $46, %k7, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $6, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k1
+; AVX512BW-NEXT:    kshiftrq $47, %k7, %k6
+; AVX512BW-NEXT:    kshiftlw $15, %k6, %k0
+; AVX512BW-NEXT:    kshiftrw $5, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $4, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $3, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $2, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $14, %k6, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
+; AVX512BW-NEXT:    korw %k0, %k1, %k1
+; AVX512BW-NEXT:    vmovdqa32 1088(%rsi), %zmm17 {%k1} {z}
+; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload
+; AVX512BW-NEXT:    kshiftrq $48, %k5, %k0
+; AVX512BW-NEXT:    kandw %k2, %k0, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $13, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $12, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $11, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $10, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $49, %k5, %k1
+; AVX512BW-NEXT:    kmovq %k5, %k2
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $4, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $50, %k2, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
+; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $2, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $14, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512BW-NEXT:    korw %k6, %k0, %k7
+; AVX512BW-NEXT:    vmovdqa32 1152(%rsi), %zmm18 {%k7} {z}
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k1, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovq %k2, %k7
+; AVX512BW-NEXT:    kshiftrq $51, %k2, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $13, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $12, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kandw %k5, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $8, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $52, %k7, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $4, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $3, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $2, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $53, %k7, %k1
+; AVX512BW-NEXT:    kmovq %k7, %k4
+; AVX512BW-NEXT:    kshiftlw $14, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k7
+; AVX512BW-NEXT:    vmovdqa32 1216(%rsi), %zmm19 {%k7} {z}
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k1, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $13, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $12, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kmovq %k4, %k7
+; AVX512BW-NEXT:    kshiftrq $54, %k4, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $6, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kandw %k5, %k0, %k1
+; AVX512BW-NEXT:    kshiftrq $55, %k7, %k6
+; AVX512BW-NEXT:    kshiftlw $15, %k6, %k0
+; AVX512BW-NEXT:    kshiftrw $5, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $4, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $3, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $2, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $14, %k6, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
+; AVX512BW-NEXT:    korw %k0, %k1, %k1
+; AVX512BW-NEXT:    vmovdqa32 1280(%rsi), %zmm20 {%k1} {z}
+; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload
+; AVX512BW-NEXT:    kshiftrq $56, %k5, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $13, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $12, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $11, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $10, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $57, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $4, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $58, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
+; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $2, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $14, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512BW-NEXT:    korw %k6, %k0, %k7
+; AVX512BW-NEXT:    vmovdqa32 1344(%rsi), %zmm21 {%k7} {z}
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k1, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $59, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $13, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $12, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $8, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $60, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $4, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $3, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $2, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $61, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $14, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k7
+; AVX512BW-NEXT:    vmovdqa32 1408(%rsi), %zmm22 {%k7} {z}
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k1, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $13, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $12, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $62, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $11, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $10, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k6
+; AVX512BW-NEXT:    kshiftrq $63, %k5, %k0
+; AVX512BW-NEXT:    kshiftrw $6, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k6, %k1
+; AVX512BW-NEXT:    kandw %k2, %k1, %k2
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k1
+; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $4, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k2, %k2
+; AVX512BW-NEXT:    kandw %k3, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $3, %k1, %k5
+; AVX512BW-NEXT:    korw %k5, %k2, %k2
+; AVX512BW-NEXT:    kandw %k4, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $2, %k1, %k4
+; AVX512BW-NEXT:    korw %k4, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k2, %k2
+; AVX512BW-NEXT:    kshiftlw $14, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k2, %k0
+; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512BW-NEXT:    korw %k1, %k0, %k1
+; AVX512BW-NEXT:    vmovdqa32 1472(%rsi), %zmm23 {%k1} {z}
+; AVX512BW-NEXT:    vmovdqa64 %zmm23, 1472(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm22, 1408(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm21, 1344(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm20, 1280(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm19, 1216(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm18, 1152(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, 1088(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, 1024(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, 960(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, 896(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, 832(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm12, 768(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, 704(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm10, 640(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, 576(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, 512(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, 448(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, 384(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, 320(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, 256(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, 192(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, 128(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, 64(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdx)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
-  %src.vec = load <64 x i1>, ptr %in.vec, align 64
-  %tgt.mask = shufflevector <64 x i1> %src.vec, <64 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 
41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-  store <384 x i1> %tgt.mask, ptr %out.vec, align 64
+  %src.mask = load <64 x i1>, ptr %in.maskvec, align 64
+  %tgt.mask = shufflevector <64 x i1> %src.mask, <64 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, 
i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+  %data = call <384 x i32> @llvm.masked.load.v384i32.p0(ptr %in.vec, i32 64, <384 x i1> %tgt.mask, <384 x i32> poison)
+  store <384 x i32> %data, ptr %out.vec, align 64
   ret void
 }
 
-define void @mask_replication_factor7_vf2(ptr %in.vec, ptr %out.vec) nounwind {
+define void @mask_replication_factor7_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512F-ONLY-LABEL: mask_replication_factor7_vf2:
 ; AVX512F-ONLY:       # %bb.0:
 ; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
-; AVX512F-ONLY-NEXT:    kshiftrw $1, %k1, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k1, %edx
-; AVX512F-ONLY-NEXT:    andl $1, %edx
-; AVX512F-ONLY-NEXT:    leal (%rdx,%rdx,2), %eax
-; AVX512F-ONLY-NEXT:    leal (%rax,%rdx,4), %eax
-; AVX512F-ONLY-NEXT:    leal (%rax,%rdx,8), %eax
-; AVX512F-ONLY-NEXT:    movl %edx, %ecx
-; AVX512F-ONLY-NEXT:    shll $4, %ecx
-; AVX512F-ONLY-NEXT:    orl %eax, %ecx
-; AVX512F-ONLY-NEXT:    movl %edx, %eax
-; AVX512F-ONLY-NEXT:    shll $5, %eax
-; AVX512F-ONLY-NEXT:    orl %ecx, %eax
-; AVX512F-ONLY-NEXT:    shll $6, %edx
-; AVX512F-ONLY-NEXT:    kmovw %k0, %ecx
-; AVX512F-ONLY-NEXT:    movl %ecx, %edi
-; AVX512F-ONLY-NEXT:    andl $1, %edi
-; AVX512F-ONLY-NEXT:    movl %edi, %r8d
-; AVX512F-ONLY-NEXT:    shll $7, %r8d
-; AVX512F-ONLY-NEXT:    orl %edx, %r8d
-; AVX512F-ONLY-NEXT:    movl %edi, %edx
-; AVX512F-ONLY-NEXT:    shll $8, %edx
-; AVX512F-ONLY-NEXT:    orl %r8d, %edx
-; AVX512F-ONLY-NEXT:    movl %edi, %r8d
-; AVX512F-ONLY-NEXT:    shll $9, %r8d
-; AVX512F-ONLY-NEXT:    orl %edx, %r8d
-; AVX512F-ONLY-NEXT:    movl %edi, %edx
-; AVX512F-ONLY-NEXT:    shll $10, %edx
-; AVX512F-ONLY-NEXT:    orl %r8d, %edx
-; AVX512F-ONLY-NEXT:    movl %edi, %r8d
-; AVX512F-ONLY-NEXT:    shll $11, %r8d
-; AVX512F-ONLY-NEXT:    orl %edx, %r8d
-; AVX512F-ONLY-NEXT:    shll $12, %edi
-; AVX512F-ONLY-NEXT:    orl %r8d, %edi
-; AVX512F-ONLY-NEXT:    shll $13, %ecx
-; AVX512F-ONLY-NEXT:    orl %edi, %ecx
-; AVX512F-ONLY-NEXT:    orl %eax, %ecx
-; AVX512F-ONLY-NEXT:    andl $16383, %ecx # imm = 0x3FFF
-; AVX512F-ONLY-NEXT:    movw %cx, (%rsi)
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,0,0,0,1,1,1,1,1,1,1,u,u>
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512F-ONLY-NEXT:    vpslld $31, %zmm0, %zmm0
+; AVX512F-ONLY-NEXT:    movw $16383, %ax # imm = 0x3FFF
+; AVX512F-ONLY-NEXT:    kmovw %eax, %k1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
+; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vextracti32x4 $2, %zmm0, 32(%rdx)
+; AVX512F-ONLY-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
+; AVX512F-ONLY-NEXT:    vmovq %xmm1, 48(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa %ymm0, (%rdx)
+; AVX512F-ONLY-NEXT:    vzeroupper
 ; AVX512F-ONLY-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: mask_replication_factor7_vf2:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    kmovb (%rdi), %k1
-; AVX512DQ-NEXT:    kshiftrb $1, %k1, %k0
-; AVX512DQ-NEXT:    kmovw %k1, %edx
-; AVX512DQ-NEXT:    andl $1, %edx
-; AVX512DQ-NEXT:    leal (%rdx,%rdx,2), %eax
-; AVX512DQ-NEXT:    leal (%rax,%rdx,4), %eax
-; AVX512DQ-NEXT:    leal (%rax,%rdx,8), %eax
-; AVX512DQ-NEXT:    movl %edx, %ecx
-; AVX512DQ-NEXT:    shll $4, %ecx
-; AVX512DQ-NEXT:    orl %eax, %ecx
-; AVX512DQ-NEXT:    movl %edx, %eax
-; AVX512DQ-NEXT:    shll $5, %eax
-; AVX512DQ-NEXT:    orl %ecx, %eax
-; AVX512DQ-NEXT:    shll $6, %edx
-; AVX512DQ-NEXT:    kmovw %k0, %ecx
-; AVX512DQ-NEXT:    movl %ecx, %edi
-; AVX512DQ-NEXT:    andl $1, %edi
-; AVX512DQ-NEXT:    movl %edi, %r8d
-; AVX512DQ-NEXT:    shll $7, %r8d
-; AVX512DQ-NEXT:    orl %edx, %r8d
-; AVX512DQ-NEXT:    movl %edi, %edx
-; AVX512DQ-NEXT:    shll $8, %edx
-; AVX512DQ-NEXT:    orl %r8d, %edx
-; AVX512DQ-NEXT:    movl %edi, %r8d
-; AVX512DQ-NEXT:    shll $9, %r8d
-; AVX512DQ-NEXT:    orl %edx, %r8d
-; AVX512DQ-NEXT:    movl %edi, %edx
-; AVX512DQ-NEXT:    shll $10, %edx
-; AVX512DQ-NEXT:    orl %r8d, %edx
-; AVX512DQ-NEXT:    movl %edi, %r8d
-; AVX512DQ-NEXT:    shll $11, %r8d
-; AVX512DQ-NEXT:    orl %edx, %r8d
-; AVX512DQ-NEXT:    shll $12, %edi
-; AVX512DQ-NEXT:    orl %r8d, %edi
-; AVX512DQ-NEXT:    shll $13, %ecx
-; AVX512DQ-NEXT:    orl %edi, %ecx
-; AVX512DQ-NEXT:    orl %eax, %ecx
-; AVX512DQ-NEXT:    andl $16383, %ecx # imm = 0x3FFF
-; AVX512DQ-NEXT:    movw %cx, (%rsi)
+; AVX512DQ-NEXT:    kmovw (%rdi), %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,0,0,0,1,1,1,1,1,1,1,u,u>
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512DQ-NEXT:    movw $16383, %ax # imm = 0x3FFF
+; AVX512DQ-NEXT:    kmovw %eax, %k1
+; AVX512DQ-NEXT:    vpcmpgtd %zmm0, %zmm1, %k1 {%k1}
+; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
+; AVX512DQ-NEXT:    vextracti32x4 $2, %zmm0, 32(%rdx)
+; AVX512DQ-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
+; AVX512DQ-NEXT:    vmovq %xmm1, 48(%rdx)
+; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rdx)
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512BW-LABEL: mask_replication_factor7_vf2:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    kmovw (%rdi), %k1
-; AVX512BW-NEXT:    kshiftrw $1, %k1, %k0
-; AVX512BW-NEXT:    kmovd %k1, %edx
-; AVX512BW-NEXT:    andl $1, %edx
-; AVX512BW-NEXT:    leal (%rdx,%rdx,2), %eax
-; AVX512BW-NEXT:    leal (%rax,%rdx,4), %eax
-; AVX512BW-NEXT:    leal (%rax,%rdx,8), %eax
-; AVX512BW-NEXT:    movl %edx, %ecx
-; AVX512BW-NEXT:    shll $4, %ecx
-; AVX512BW-NEXT:    orl %eax, %ecx
-; AVX512BW-NEXT:    movl %edx, %eax
-; AVX512BW-NEXT:    shll $5, %eax
-; AVX512BW-NEXT:    orl %ecx, %eax
-; AVX512BW-NEXT:    shll $6, %edx
-; AVX512BW-NEXT:    kmovd %k0, %ecx
-; AVX512BW-NEXT:    movl %ecx, %edi
-; AVX512BW-NEXT:    andl $1, %edi
-; AVX512BW-NEXT:    movl %edi, %r8d
-; AVX512BW-NEXT:    shll $7, %r8d
-; AVX512BW-NEXT:    orl %edx, %r8d
-; AVX512BW-NEXT:    movl %edi, %edx
-; AVX512BW-NEXT:    shll $8, %edx
-; AVX512BW-NEXT:    orl %r8d, %edx
-; AVX512BW-NEXT:    movl %edi, %r8d
-; AVX512BW-NEXT:    shll $9, %r8d
-; AVX512BW-NEXT:    orl %edx, %r8d
-; AVX512BW-NEXT:    movl %edi, %edx
-; AVX512BW-NEXT:    shll $10, %edx
-; AVX512BW-NEXT:    orl %r8d, %edx
-; AVX512BW-NEXT:    movl %edi, %r8d
-; AVX512BW-NEXT:    shll $11, %r8d
-; AVX512BW-NEXT:    orl %edx, %r8d
-; AVX512BW-NEXT:    shll $12, %edi
-; AVX512BW-NEXT:    orl %r8d, %edi
-; AVX512BW-NEXT:    shll $13, %ecx
-; AVX512BW-NEXT:    orl %edi, %ecx
-; AVX512BW-NEXT:    orl %eax, %ecx
-; AVX512BW-NEXT:    andl $16383, %ecx # imm = 0x3FFF
-; AVX512BW-NEXT:    movw %cx, (%rsi)
+; AVX512BW-NEXT:    kmovq (%rdi), %k1
+; AVX512BW-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,0,0,0,1,1,1,1,1,1,1,u,u>
+; AVX512BW-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vpslld $31, %zmm0, %zmm0
+; AVX512BW-NEXT:    movw $16383, %ax # imm = 0x3FFF
+; AVX512BW-NEXT:    kmovd %eax, %k1
+; AVX512BW-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
+; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    vextracti32x4 $2, %zmm0, 32(%rdx)
+; AVX512BW-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
+; AVX512BW-NEXT:    vmovq %xmm1, 48(%rdx)
+; AVX512BW-NEXT:    vmovdqa %ymm0, (%rdx)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
-  %src.vec = load <2 x i1>, ptr %in.vec, align 64
-  %tgt.mask = shufflevector <2 x i1> %src.vec, <2 x i1> poison, <14 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-  store <14 x i1> %tgt.mask, ptr %out.vec, align 64
+  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
+  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <2 x i32> <i32 0, i32 1>
+  %tgt.mask = shufflevector <2 x i1> %src.mask, <2 x i1> poison, <14 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %data = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr %in.vec, i32 64, <14 x i1> %tgt.mask, <14 x i32> poison)
+  %data.padded = shufflevector <14 x i32> %data, <14 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 undef, i32 undef>
+  store <14 x i32> %data, ptr %out.vec, align 64
   ret void
 }
 
-define void @mask_replication_factor7_vf4(ptr %in.vec, ptr %out.vec) nounwind {
+define void @mask_replication_factor7_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512F-ONLY-LABEL: mask_replication_factor7_vf4:
 ; AVX512F-ONLY:       # %bb.0:
-; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k3
-; AVX512F-ONLY-NEXT:    kshiftrw $3, %k3, %k0
-; AVX512F-ONLY-NEXT:    kshiftrw $2, %k3, %k1
-; AVX512F-ONLY-NEXT:    kshiftrw $1, %k3, %k2
-; AVX512F-ONLY-NEXT:    kmovw %k3, %edx
-; AVX512F-ONLY-NEXT:    andl $1, %edx
-; AVX512F-ONLY-NEXT:    leal (%rdx,%rdx,2), %eax
-; AVX512F-ONLY-NEXT:    leal (%rax,%rdx,4), %eax
-; AVX512F-ONLY-NEXT:    leal (%rax,%rdx,8), %eax
-; AVX512F-ONLY-NEXT:    movl %edx, %ecx
-; AVX512F-ONLY-NEXT:    shll $4, %ecx
-; AVX512F-ONLY-NEXT:    orl %eax, %ecx
-; AVX512F-ONLY-NEXT:    movl %edx, %eax
-; AVX512F-ONLY-NEXT:    shll $5, %eax
-; AVX512F-ONLY-NEXT:    orl %ecx, %eax
-; AVX512F-ONLY-NEXT:    shll $6, %edx
-; AVX512F-ONLY-NEXT:    kmovw %k2, %ecx
-; AVX512F-ONLY-NEXT:    andl $1, %ecx
-; AVX512F-ONLY-NEXT:    movl %ecx, %edi
-; AVX512F-ONLY-NEXT:    shll $7, %edi
-; AVX512F-ONLY-NEXT:    orl %edx, %edi
-; AVX512F-ONLY-NEXT:    movl %ecx, %edx
-; AVX512F-ONLY-NEXT:    shll $8, %edx
-; AVX512F-ONLY-NEXT:    orl %edi, %edx
-; AVX512F-ONLY-NEXT:    movl %ecx, %edi
-; AVX512F-ONLY-NEXT:    shll $9, %edi
-; AVX512F-ONLY-NEXT:    orl %edx, %edi
-; AVX512F-ONLY-NEXT:    movl %ecx, %edx
-; AVX512F-ONLY-NEXT:    shll $10, %edx
-; AVX512F-ONLY-NEXT:    orl %edi, %edx
-; AVX512F-ONLY-NEXT:    movl %ecx, %edi
-; AVX512F-ONLY-NEXT:    shll $11, %edi
-; AVX512F-ONLY-NEXT:    orl %edx, %edi
-; AVX512F-ONLY-NEXT:    movl %ecx, %edx
-; AVX512F-ONLY-NEXT:    shll $12, %edx
-; AVX512F-ONLY-NEXT:    orl %edi, %edx
-; AVX512F-ONLY-NEXT:    shll $13, %ecx
-; AVX512F-ONLY-NEXT:    orl %edx, %ecx
-; AVX512F-ONLY-NEXT:    kmovw %k1, %edx
-; AVX512F-ONLY-NEXT:    andl $1, %edx
-; AVX512F-ONLY-NEXT:    movl %edx, %edi
-; AVX512F-ONLY-NEXT:    shll $14, %edi
-; AVX512F-ONLY-NEXT:    orl %ecx, %edi
-; AVX512F-ONLY-NEXT:    movl %edx, %ecx
-; AVX512F-ONLY-NEXT:    shll $15, %ecx
-; AVX512F-ONLY-NEXT:    orl %edi, %ecx
-; AVX512F-ONLY-NEXT:    movl %edx, %edi
-; AVX512F-ONLY-NEXT:    shll $16, %edi
-; AVX512F-ONLY-NEXT:    orl %ecx, %edi
-; AVX512F-ONLY-NEXT:    movl %edx, %ecx
-; AVX512F-ONLY-NEXT:    shll $17, %ecx
-; AVX512F-ONLY-NEXT:    orl %edi, %ecx
-; AVX512F-ONLY-NEXT:    movl %edx, %edi
-; AVX512F-ONLY-NEXT:    shll $18, %edi
-; AVX512F-ONLY-NEXT:    orl %ecx, %edi
-; AVX512F-ONLY-NEXT:    movl %edx, %ecx
-; AVX512F-ONLY-NEXT:    shll $19, %ecx
-; AVX512F-ONLY-NEXT:    orl %edi, %ecx
-; AVX512F-ONLY-NEXT:    shll $20, %edx
-; AVX512F-ONLY-NEXT:    orl %ecx, %edx
-; AVX512F-ONLY-NEXT:    kmovw %k0, %ecx
-; AVX512F-ONLY-NEXT:    movl %ecx, %edi
-; AVX512F-ONLY-NEXT:    andl $1, %edi
-; AVX512F-ONLY-NEXT:    movl %edi, %r8d
-; AVX512F-ONLY-NEXT:    shll $21, %r8d
-; AVX512F-ONLY-NEXT:    orl %edx, %r8d
-; AVX512F-ONLY-NEXT:    movl %edi, %edx
-; AVX512F-ONLY-NEXT:    shll $22, %edx
-; AVX512F-ONLY-NEXT:    orl %r8d, %edx
-; AVX512F-ONLY-NEXT:    movl %edi, %r8d
-; AVX512F-ONLY-NEXT:    shll $23, %r8d
-; AVX512F-ONLY-NEXT:    orl %edx, %r8d
-; AVX512F-ONLY-NEXT:    movl %edi, %edx
-; AVX512F-ONLY-NEXT:    shll $24, %edx
-; AVX512F-ONLY-NEXT:    orl %r8d, %edx
-; AVX512F-ONLY-NEXT:    movl %edi, %r8d
-; AVX512F-ONLY-NEXT:    shll $25, %r8d
-; AVX512F-ONLY-NEXT:    orl %edx, %r8d
-; AVX512F-ONLY-NEXT:    shll $26, %edi
-; AVX512F-ONLY-NEXT:    orl %r8d, %edi
-; AVX512F-ONLY-NEXT:    shll $27, %ecx
-; AVX512F-ONLY-NEXT:    orl %edi, %ecx
-; AVX512F-ONLY-NEXT:    orl %eax, %ecx
-; AVX512F-ONLY-NEXT:    andl $268435455, %ecx # imm = 0xFFFFFFF
-; AVX512F-ONLY-NEXT:    movl %ecx, (%rsi)
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <2,2,2,2,2,3,3,3,3,3,3,3,u,u,u,u>
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    vpslld $31, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    movw $4095, %ax # imm = 0xFFF
+; AVX512F-ONLY-NEXT:    kmovw %eax, %k1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1 {%k1}
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k2
+; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k2} {z}
+; AVX512F-ONLY-NEXT:    vextracti32x4 $2, %zmm0, 96(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm1, (%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa %ymm0, 64(%rdx)
+; AVX512F-ONLY-NEXT:    vzeroupper
 ; AVX512F-ONLY-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: mask_replication_factor7_vf4:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    kmovb (%rdi), %k3
-; AVX512DQ-NEXT:    kshiftrb $3, %k3, %k0
-; AVX512DQ-NEXT:    kshiftrb $2, %k3, %k1
-; AVX512DQ-NEXT:    kshiftrb $1, %k3, %k2
-; AVX512DQ-NEXT:    kmovw %k3, %edx
-; AVX512DQ-NEXT:    andl $1, %edx
-; AVX512DQ-NEXT:    leal (%rdx,%rdx,2), %eax
-; AVX512DQ-NEXT:    leal (%rax,%rdx,4), %eax
-; AVX512DQ-NEXT:    leal (%rax,%rdx,8), %eax
-; AVX512DQ-NEXT:    movl %edx, %ecx
-; AVX512DQ-NEXT:    shll $4, %ecx
-; AVX512DQ-NEXT:    orl %eax, %ecx
-; AVX512DQ-NEXT:    movl %edx, %eax
-; AVX512DQ-NEXT:    shll $5, %eax
-; AVX512DQ-NEXT:    orl %ecx, %eax
-; AVX512DQ-NEXT:    shll $6, %edx
-; AVX512DQ-NEXT:    kmovw %k2, %ecx
-; AVX512DQ-NEXT:    andl $1, %ecx
-; AVX512DQ-NEXT:    movl %ecx, %edi
-; AVX512DQ-NEXT:    shll $7, %edi
-; AVX512DQ-NEXT:    orl %edx, %edi
-; AVX512DQ-NEXT:    movl %ecx, %edx
-; AVX512DQ-NEXT:    shll $8, %edx
-; AVX512DQ-NEXT:    orl %edi, %edx
-; AVX512DQ-NEXT:    movl %ecx, %edi
-; AVX512DQ-NEXT:    shll $9, %edi
-; AVX512DQ-NEXT:    orl %edx, %edi
-; AVX512DQ-NEXT:    movl %ecx, %edx
-; AVX512DQ-NEXT:    shll $10, %edx
-; AVX512DQ-NEXT:    orl %edi, %edx
-; AVX512DQ-NEXT:    movl %ecx, %edi
-; AVX512DQ-NEXT:    shll $11, %edi
-; AVX512DQ-NEXT:    orl %edx, %edi
-; AVX512DQ-NEXT:    movl %ecx, %edx
-; AVX512DQ-NEXT:    shll $12, %edx
-; AVX512DQ-NEXT:    orl %edi, %edx
-; AVX512DQ-NEXT:    shll $13, %ecx
-; AVX512DQ-NEXT:    orl %edx, %ecx
-; AVX512DQ-NEXT:    kmovw %k1, %edx
-; AVX512DQ-NEXT:    andl $1, %edx
-; AVX512DQ-NEXT:    movl %edx, %edi
-; AVX512DQ-NEXT:    shll $14, %edi
-; AVX512DQ-NEXT:    orl %ecx, %edi
-; AVX512DQ-NEXT:    movl %edx, %ecx
-; AVX512DQ-NEXT:    shll $15, %ecx
-; AVX512DQ-NEXT:    orl %edi, %ecx
-; AVX512DQ-NEXT:    movl %edx, %edi
-; AVX512DQ-NEXT:    shll $16, %edi
-; AVX512DQ-NEXT:    orl %ecx, %edi
-; AVX512DQ-NEXT:    movl %edx, %ecx
-; AVX512DQ-NEXT:    shll $17, %ecx
-; AVX512DQ-NEXT:    orl %edi, %ecx
-; AVX512DQ-NEXT:    movl %edx, %edi
-; AVX512DQ-NEXT:    shll $18, %edi
-; AVX512DQ-NEXT:    orl %ecx, %edi
-; AVX512DQ-NEXT:    movl %edx, %ecx
-; AVX512DQ-NEXT:    shll $19, %ecx
-; AVX512DQ-NEXT:    orl %edi, %ecx
-; AVX512DQ-NEXT:    shll $20, %edx
-; AVX512DQ-NEXT:    orl %ecx, %edx
-; AVX512DQ-NEXT:    kmovw %k0, %ecx
-; AVX512DQ-NEXT:    movl %ecx, %edi
-; AVX512DQ-NEXT:    andl $1, %edi
-; AVX512DQ-NEXT:    movl %edi, %r8d
-; AVX512DQ-NEXT:    shll $21, %r8d
-; AVX512DQ-NEXT:    orl %edx, %r8d
-; AVX512DQ-NEXT:    movl %edi, %edx
-; AVX512DQ-NEXT:    shll $22, %edx
-; AVX512DQ-NEXT:    orl %r8d, %edx
-; AVX512DQ-NEXT:    movl %edi, %r8d
-; AVX512DQ-NEXT:    shll $23, %r8d
-; AVX512DQ-NEXT:    orl %edx, %r8d
-; AVX512DQ-NEXT:    movl %edi, %edx
-; AVX512DQ-NEXT:    shll $24, %edx
-; AVX512DQ-NEXT:    orl %r8d, %edx
-; AVX512DQ-NEXT:    movl %edi, %r8d
-; AVX512DQ-NEXT:    shll $25, %r8d
-; AVX512DQ-NEXT:    orl %edx, %r8d
-; AVX512DQ-NEXT:    shll $26, %edi
-; AVX512DQ-NEXT:    orl %r8d, %edi
-; AVX512DQ-NEXT:    shll $27, %ecx
-; AVX512DQ-NEXT:    orl %edi, %ecx
-; AVX512DQ-NEXT:    orl %eax, %ecx
-; AVX512DQ-NEXT:    andl $268435455, %ecx # imm = 0xFFFFFFF
-; AVX512DQ-NEXT:    movl %ecx, (%rsi)
+; AVX512DQ-NEXT:    kmovw (%rdi), %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <2,2,2,2,2,3,3,3,3,3,3,3,u,u,u,u>
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512DQ-NEXT:    movw $4095, %ax # imm = 0xFFF
+; AVX512DQ-NEXT:    kmovw %eax, %k1
+; AVX512DQ-NEXT:    vpcmpgtd %zmm1, %zmm2, %k1 {%k1}
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k2
+; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
+; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k2} {z}
+; AVX512DQ-NEXT:    vextracti32x4 $2, %zmm0, 96(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm1, (%rdx)
+; AVX512DQ-NEXT:    vmovdqa %ymm0, 64(%rdx)
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512BW-LABEL: mask_replication_factor7_vf4:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    kmovw (%rdi), %k3
-; AVX512BW-NEXT:    kshiftrw $3, %k3, %k0
-; AVX512BW-NEXT:    kshiftrw $2, %k3, %k1
-; AVX512BW-NEXT:    kshiftrw $1, %k3, %k2
-; AVX512BW-NEXT:    kmovd %k3, %edx
-; AVX512BW-NEXT:    andl $1, %edx
-; AVX512BW-NEXT:    leal (%rdx,%rdx,2), %eax
-; AVX512BW-NEXT:    leal (%rax,%rdx,4), %eax
-; AVX512BW-NEXT:    leal (%rax,%rdx,8), %eax
-; AVX512BW-NEXT:    movl %edx, %ecx
-; AVX512BW-NEXT:    shll $4, %ecx
-; AVX512BW-NEXT:    orl %eax, %ecx
-; AVX512BW-NEXT:    movl %edx, %eax
-; AVX512BW-NEXT:    shll $5, %eax
-; AVX512BW-NEXT:    orl %ecx, %eax
-; AVX512BW-NEXT:    shll $6, %edx
-; AVX512BW-NEXT:    kmovd %k2, %ecx
-; AVX512BW-NEXT:    andl $1, %ecx
-; AVX512BW-NEXT:    movl %ecx, %edi
-; AVX512BW-NEXT:    shll $7, %edi
-; AVX512BW-NEXT:    orl %edx, %edi
-; AVX512BW-NEXT:    movl %ecx, %edx
-; AVX512BW-NEXT:    shll $8, %edx
-; AVX512BW-NEXT:    orl %edi, %edx
-; AVX512BW-NEXT:    movl %ecx, %edi
-; AVX512BW-NEXT:    shll $9, %edi
-; AVX512BW-NEXT:    orl %edx, %edi
-; AVX512BW-NEXT:    movl %ecx, %edx
-; AVX512BW-NEXT:    shll $10, %edx
-; AVX512BW-NEXT:    orl %edi, %edx
-; AVX512BW-NEXT:    movl %ecx, %edi
-; AVX512BW-NEXT:    shll $11, %edi
-; AVX512BW-NEXT:    orl %edx, %edi
-; AVX512BW-NEXT:    movl %ecx, %edx
-; AVX512BW-NEXT:    shll $12, %edx
-; AVX512BW-NEXT:    orl %edi, %edx
-; AVX512BW-NEXT:    shll $13, %ecx
-; AVX512BW-NEXT:    orl %edx, %ecx
-; AVX512BW-NEXT:    kmovd %k1, %edx
-; AVX512BW-NEXT:    andl $1, %edx
-; AVX512BW-NEXT:    movl %edx, %edi
-; AVX512BW-NEXT:    shll $14, %edi
-; AVX512BW-NEXT:    orl %ecx, %edi
-; AVX512BW-NEXT:    movl %edx, %ecx
-; AVX512BW-NEXT:    shll $15, %ecx
-; AVX512BW-NEXT:    orl %edi, %ecx
-; AVX512BW-NEXT:    movl %edx, %edi
-; AVX512BW-NEXT:    shll $16, %edi
-; AVX512BW-NEXT:    orl %ecx, %edi
-; AVX512BW-NEXT:    movl %edx, %ecx
-; AVX512BW-NEXT:    shll $17, %ecx
-; AVX512BW-NEXT:    orl %edi, %ecx
-; AVX512BW-NEXT:    movl %edx, %edi
-; AVX512BW-NEXT:    shll $18, %edi
-; AVX512BW-NEXT:    orl %ecx, %edi
-; AVX512BW-NEXT:    movl %edx, %ecx
-; AVX512BW-NEXT:    shll $19, %ecx
-; AVX512BW-NEXT:    orl %edi, %ecx
-; AVX512BW-NEXT:    shll $20, %edx
-; AVX512BW-NEXT:    orl %ecx, %edx
-; AVX512BW-NEXT:    kmovd %k0, %ecx
-; AVX512BW-NEXT:    movl %ecx, %edi
-; AVX512BW-NEXT:    andl $1, %edi
-; AVX512BW-NEXT:    movl %edi, %r8d
-; AVX512BW-NEXT:    shll $21, %r8d
-; AVX512BW-NEXT:    orl %edx, %r8d
-; AVX512BW-NEXT:    movl %edi, %edx
-; AVX512BW-NEXT:    shll $22, %edx
-; AVX512BW-NEXT:    orl %r8d, %edx
-; AVX512BW-NEXT:    movl %edi, %r8d
-; AVX512BW-NEXT:    shll $23, %r8d
-; AVX512BW-NEXT:    orl %edx, %r8d
-; AVX512BW-NEXT:    movl %edi, %edx
-; AVX512BW-NEXT:    shll $24, %edx
-; AVX512BW-NEXT:    orl %r8d, %edx
-; AVX512BW-NEXT:    movl %edi, %r8d
-; AVX512BW-NEXT:    shll $25, %r8d
-; AVX512BW-NEXT:    orl %edx, %r8d
-; AVX512BW-NEXT:    shll $26, %edi
-; AVX512BW-NEXT:    orl %r8d, %edi
-; AVX512BW-NEXT:    shll $27, %ecx
-; AVX512BW-NEXT:    orl %edi, %ecx
-; AVX512BW-NEXT:    orl %eax, %ecx
-; AVX512BW-NEXT:    andl $268435455, %ecx # imm = 0xFFFFFFF
-; AVX512BW-NEXT:    movl %ecx, (%rsi)
+; AVX512BW-NEXT:    kmovq (%rdi), %k0
+; AVX512BW-NEXT:    vpmovm2w %k0, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2,2,2,2,2,2,3,3,3,3,3,3,3,u,u,u,u>
+; AVX512BW-NEXT:    vpermw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT:    movl $268435455, %eax # imm = 0xFFFFFFF
+; AVX512BW-NEXT:    kmovd %eax, %k1
+; AVX512BW-NEXT:    vpcmpgtw %zmm0, %zmm1, %k1 {%k1}
+; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    kshiftrd $16, %k1, %k1
+; AVX512BW-NEXT:    vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
+; AVX512BW-NEXT:    vextracti32x4 $2, %zmm1, 96(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdx)
+; AVX512BW-NEXT:    vmovdqa %ymm1, 64(%rdx)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
-  %src.vec = load <4 x i1>, ptr %in.vec, align 64
-  %tgt.mask = shufflevector <4 x i1> %src.vec, <4 x i1> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-  store <28 x i1> %tgt.mask, ptr %out.vec, align 64
+  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
+  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %tgt.mask = shufflevector <4 x i1> %src.mask, <4 x i1> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  %data = call <28 x i32> @llvm.masked.load.v28i32.p0(ptr %in.vec, i32 64, <28 x i1> %tgt.mask, <28 x i32> poison)
+  %data.padded = shufflevector <28 x i32> %data, <28 x i32> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 undef, i32 undef, i32 undef, i32 undef>
+  store <28 x i32> %data, ptr %out.vec, align 64
   ret void
 }
 
-define void @mask_replication_factor7_vf8(ptr %in.vec, ptr %out.vec) nounwind {
-; AVX512F-ONLY-LABEL: mask_replication_factor7_vf8:
-; AVX512F-ONLY:       # %bb.0:
-; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k2
-; AVX512F-ONLY-NEXT:    kshiftrw $7, %k2, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, %eax
-; AVX512F-ONLY-NEXT:    kshiftrw $6, %k2, %k0
-; AVX512F-ONLY-NEXT:    kshiftrw $5, %k2, %k1
-; AVX512F-ONLY-NEXT:    kshiftrw $4, %k2, %k3
-; AVX512F-ONLY-NEXT:    kmovw %k3, %ecx
-; AVX512F-ONLY-NEXT:    kshiftrw $3, %k2, %k3
-; AVX512F-ONLY-NEXT:    kmovw %k3, %edx
-; AVX512F-ONLY-NEXT:    kshiftrw $2, %k2, %k3
-; AVX512F-ONLY-NEXT:    kmovw %k3, %r8d
-; AVX512F-ONLY-NEXT:    kshiftrw $1, %k2, %k3
-; AVX512F-ONLY-NEXT:    kmovw %k3, %r9d
-; AVX512F-ONLY-NEXT:    kmovw %k2, %edi
-; AVX512F-ONLY-NEXT:    movzbl %dil, %r10d
-; AVX512F-ONLY-NEXT:    andl $1, %r10d
-; AVX512F-ONLY-NEXT:    leaq (%r10,%r10,2), %rdi
-; AVX512F-ONLY-NEXT:    leaq (%rdi,%r10,4), %rdi
-; AVX512F-ONLY-NEXT:    leaq (%rdi,%r10,8), %rdi
-; AVX512F-ONLY-NEXT:    movq %r10, %r11
-; AVX512F-ONLY-NEXT:    shlq $4, %r11
-; AVX512F-ONLY-NEXT:    orq %rdi, %r11
-; AVX512F-ONLY-NEXT:    movq %r10, %rdi
-; AVX512F-ONLY-NEXT:    shlq $5, %rdi
-; AVX512F-ONLY-NEXT:    orq %r11, %rdi
-; AVX512F-ONLY-NEXT:    shlq $6, %r10
-; AVX512F-ONLY-NEXT:    movzbl %r9b, %r9d
-; AVX512F-ONLY-NEXT:    andl $1, %r9d
-; AVX512F-ONLY-NEXT:    movq %r9, %r11
-; AVX512F-ONLY-NEXT:    shlq $7, %r11
-; AVX512F-ONLY-NEXT:    orq %r10, %r11
-; AVX512F-ONLY-NEXT:    movq %r9, %r10
-; AVX512F-ONLY-NEXT:    shlq $8, %r10
-; AVX512F-ONLY-NEXT:    orq %r11, %r10
-; AVX512F-ONLY-NEXT:    movq %r9, %r11
-; AVX512F-ONLY-NEXT:    shlq $9, %r11
-; AVX512F-ONLY-NEXT:    orq %r10, %r11
-; AVX512F-ONLY-NEXT:    movq %r9, %r10
-; AVX512F-ONLY-NEXT:    shlq $10, %r10
-; AVX512F-ONLY-NEXT:    orq %r11, %r10
-; AVX512F-ONLY-NEXT:    movq %r9, %r11
-; AVX512F-ONLY-NEXT:    shlq $11, %r11
-; AVX512F-ONLY-NEXT:    orq %r10, %r11
-; AVX512F-ONLY-NEXT:    movq %r9, %r10
-; AVX512F-ONLY-NEXT:    shlq $12, %r10
-; AVX512F-ONLY-NEXT:    orq %r11, %r10
-; AVX512F-ONLY-NEXT:    shlq $13, %r9
-; AVX512F-ONLY-NEXT:    orq %r10, %r9
-; AVX512F-ONLY-NEXT:    movzbl %r8b, %r8d
-; AVX512F-ONLY-NEXT:    andl $1, %r8d
-; AVX512F-ONLY-NEXT:    movq %r8, %r10
-; AVX512F-ONLY-NEXT:    shlq $14, %r10
-; AVX512F-ONLY-NEXT:    orq %r9, %r10
-; AVX512F-ONLY-NEXT:    movq %r8, %r9
-; AVX512F-ONLY-NEXT:    shlq $15, %r9
-; AVX512F-ONLY-NEXT:    orq %r10, %r9
-; AVX512F-ONLY-NEXT:    movq %r8, %r10
-; AVX512F-ONLY-NEXT:    shlq $16, %r10
-; AVX512F-ONLY-NEXT:    orq %r9, %r10
-; AVX512F-ONLY-NEXT:    movq %r8, %r9
-; AVX512F-ONLY-NEXT:    shlq $17, %r9
-; AVX512F-ONLY-NEXT:    orq %r10, %r9
-; AVX512F-ONLY-NEXT:    movq %r8, %r10
-; AVX512F-ONLY-NEXT:    shlq $18, %r10
-; AVX512F-ONLY-NEXT:    orq %r9, %r10
-; AVX512F-ONLY-NEXT:    movq %r8, %r9
-; AVX512F-ONLY-NEXT:    shlq $19, %r9
-; AVX512F-ONLY-NEXT:    orq %r10, %r9
-; AVX512F-ONLY-NEXT:    shlq $20, %r8
-; AVX512F-ONLY-NEXT:    orq %r9, %r8
-; AVX512F-ONLY-NEXT:    movzbl %dl, %r9d
-; AVX512F-ONLY-NEXT:    andl $1, %r9d
-; AVX512F-ONLY-NEXT:    movq %r9, %rdx
-; AVX512F-ONLY-NEXT:    shlq $21, %rdx
-; AVX512F-ONLY-NEXT:    orq %r8, %rdx
-; AVX512F-ONLY-NEXT:    movq %r9, %r8
-; AVX512F-ONLY-NEXT:    shlq $22, %r8
-; AVX512F-ONLY-NEXT:    orq %rdx, %r8
-; AVX512F-ONLY-NEXT:    movq %r9, %rdx
-; AVX512F-ONLY-NEXT:    shlq $23, %rdx
-; AVX512F-ONLY-NEXT:    orq %r8, %rdx
-; AVX512F-ONLY-NEXT:    movq %r9, %r8
-; AVX512F-ONLY-NEXT:    shlq $24, %r8
-; AVX512F-ONLY-NEXT:    orq %rdx, %r8
-; AVX512F-ONLY-NEXT:    movq %r9, %rdx
-; AVX512F-ONLY-NEXT:    shlq $25, %rdx
-; AVX512F-ONLY-NEXT:    orq %r8, %rdx
-; AVX512F-ONLY-NEXT:    movq %r9, %r8
-; AVX512F-ONLY-NEXT:    shlq $26, %r8
-; AVX512F-ONLY-NEXT:    orq %rdx, %r8
-; AVX512F-ONLY-NEXT:    shlq $27, %r9
-; AVX512F-ONLY-NEXT:    orq %r8, %r9
-; AVX512F-ONLY-NEXT:    movzbl %cl, %edx
-; AVX512F-ONLY-NEXT:    andl $1, %edx
-; AVX512F-ONLY-NEXT:    movq %rdx, %r8
-; AVX512F-ONLY-NEXT:    shlq $28, %r8
-; AVX512F-ONLY-NEXT:    orq %r9, %r8
-; AVX512F-ONLY-NEXT:    movq %rdx, %r9
-; AVX512F-ONLY-NEXT:    shlq $29, %r9
-; AVX512F-ONLY-NEXT:    orq %r8, %r9
-; AVX512F-ONLY-NEXT:    movq %rdx, %r8
-; AVX512F-ONLY-NEXT:    shlq $30, %r8
-; AVX512F-ONLY-NEXT:    orq %r9, %r8
-; AVX512F-ONLY-NEXT:    shlq $31, %rdx
-; AVX512F-ONLY-NEXT:    orq %r8, %rdx
-; AVX512F-ONLY-NEXT:    orq %rdi, %rdx
-; AVX512F-ONLY-NEXT:    movl %edx, (%rsi)
-; AVX512F-ONLY-NEXT:    andl $1, %ecx
-; AVX512F-ONLY-NEXT:    movq %rcx, %rdi
-; AVX512F-ONLY-NEXT:    shlq $32, %rdi
-; AVX512F-ONLY-NEXT:    movq %rcx, %r8
-; AVX512F-ONLY-NEXT:    shlq $33, %r8
-; AVX512F-ONLY-NEXT:    orq %rdi, %r8
-; AVX512F-ONLY-NEXT:    shlq $34, %rcx
-; AVX512F-ONLY-NEXT:    orq %r8, %rcx
-; AVX512F-ONLY-NEXT:    kmovw %k1, %edi
-; AVX512F-ONLY-NEXT:    andl $1, %edi
-; AVX512F-ONLY-NEXT:    movq %rdi, %r8
-; AVX512F-ONLY-NEXT:    shlq $35, %r8
-; AVX512F-ONLY-NEXT:    orq %rcx, %r8
-; AVX512F-ONLY-NEXT:    movq %rdi, %rcx
-; AVX512F-ONLY-NEXT:    shlq $36, %rcx
-; AVX512F-ONLY-NEXT:    orq %r8, %rcx
-; AVX512F-ONLY-NEXT:    movq %rdi, %r8
-; AVX512F-ONLY-NEXT:    shlq $37, %r8
-; AVX512F-ONLY-NEXT:    orq %rcx, %r8
-; AVX512F-ONLY-NEXT:    movq %rdi, %rcx
-; AVX512F-ONLY-NEXT:    shlq $38, %rcx
-; AVX512F-ONLY-NEXT:    orq %r8, %rcx
-; AVX512F-ONLY-NEXT:    movq %rdi, %r8
-; AVX512F-ONLY-NEXT:    shlq $39, %r8
-; AVX512F-ONLY-NEXT:    orq %rcx, %r8
-; AVX512F-ONLY-NEXT:    movq %rdi, %rcx
-; AVX512F-ONLY-NEXT:    shlq $40, %rcx
-; AVX512F-ONLY-NEXT:    orq %r8, %rcx
-; AVX512F-ONLY-NEXT:    shlq $41, %rdi
-; AVX512F-ONLY-NEXT:    orq %rcx, %rdi
-; AVX512F-ONLY-NEXT:    kmovw %k0, %ecx
-; AVX512F-ONLY-NEXT:    andl $1, %ecx
-; AVX512F-ONLY-NEXT:    movq %rcx, %r8
-; AVX512F-ONLY-NEXT:    shlq $42, %r8
-; AVX512F-ONLY-NEXT:    orq %rdi, %r8
-; AVX512F-ONLY-NEXT:    movq %rcx, %rdi
-; AVX512F-ONLY-NEXT:    shlq $43, %rdi
-; AVX512F-ONLY-NEXT:    orq %r8, %rdi
-; AVX512F-ONLY-NEXT:    movq %rcx, %r8
-; AVX512F-ONLY-NEXT:    shlq $44, %r8
-; AVX512F-ONLY-NEXT:    orq %rdi, %r8
-; AVX512F-ONLY-NEXT:    movq %rcx, %rdi
-; AVX512F-ONLY-NEXT:    shlq $45, %rdi
-; AVX512F-ONLY-NEXT:    orq %r8, %rdi
-; AVX512F-ONLY-NEXT:    movq %rcx, %r8
-; AVX512F-ONLY-NEXT:    shlq $46, %r8
-; AVX512F-ONLY-NEXT:    orq %rdi, %r8
-; AVX512F-ONLY-NEXT:    movq %rcx, %rdi
-; AVX512F-ONLY-NEXT:    shlq $47, %rdi
-; AVX512F-ONLY-NEXT:    orq %r8, %rdi
-; AVX512F-ONLY-NEXT:    shlq $48, %rcx
-; AVX512F-ONLY-NEXT:    orq %rdi, %rcx
-; AVX512F-ONLY-NEXT:    movzbl %al, %edi
-; AVX512F-ONLY-NEXT:    # kill: def $eax killed $eax def $rax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %r8
-; AVX512F-ONLY-NEXT:    shlq $49, %r8
-; AVX512F-ONLY-NEXT:    orq %rcx, %r8
-; AVX512F-ONLY-NEXT:    movq %rax, %rcx
-; AVX512F-ONLY-NEXT:    shlq $50, %rcx
-; AVX512F-ONLY-NEXT:    orq %r8, %rcx
-; AVX512F-ONLY-NEXT:    movq %rax, %r8
-; AVX512F-ONLY-NEXT:    shlq $51, %r8
-; AVX512F-ONLY-NEXT:    orq %rcx, %r8
-; AVX512F-ONLY-NEXT:    movq %rax, %rcx
-; AVX512F-ONLY-NEXT:    shlq $52, %rcx
-; AVX512F-ONLY-NEXT:    orq %r8, %rcx
-; AVX512F-ONLY-NEXT:    movq %rax, %r8
-; AVX512F-ONLY-NEXT:    shlq $53, %r8
-; AVX512F-ONLY-NEXT:    orq %rcx, %r8
-; AVX512F-ONLY-NEXT:    shlq $54, %rax
-; AVX512F-ONLY-NEXT:    orq %r8, %rax
-; AVX512F-ONLY-NEXT:    shlq $55, %rdi
-; AVX512F-ONLY-NEXT:    orq %rax, %rdi
-; AVX512F-ONLY-NEXT:    orq %rdx, %rdi
-; AVX512F-ONLY-NEXT:    movq %rdi, %rax
-; AVX512F-ONLY-NEXT:    shrq $48, %rax
-; AVX512F-ONLY-NEXT:    movb %al, 6(%rsi)
-; AVX512F-ONLY-NEXT:    shrq $32, %rdi
-; AVX512F-ONLY-NEXT:    movw %di, 4(%rsi)
-; AVX512F-ONLY-NEXT:    retq
+define void @mask_replication_factor7_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
+; AVX512F-SLOW-LABEL: mask_replication_factor7_vf8:
+; AVX512F-SLOW:       # %bb.0:
+; AVX512F-SLOW-NEXT:    kmovw (%rdi), %k1
+; AVX512F-SLOW-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
+; AVX512F-SLOW-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512F-SLOW-NEXT:    vptestmd %zmm1, %zmm1, %k2
+; AVX512F-SLOW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
+; AVX512F-SLOW-NEXT:    movw $1, %ax
+; AVX512F-SLOW-NEXT:    kmovw %eax, %k2
+; AVX512F-SLOW-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k2}
+; AVX512F-SLOW-NEXT:    vptestmd %zmm1, %zmm1, %k2
+; AVX512F-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4]
+; AVX512F-SLOW-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512F-SLOW-NEXT:    vptestmd %zmm1, %zmm1, %k3
+; AVX512F-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6]
+; AVX512F-SLOW-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512F-SLOW-NEXT:    vptestmd %zmm0, %zmm0, %k4
+; AVX512F-SLOW-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512F-SLOW-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512F-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,3,3,6,7,7,7]
+; AVX512F-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,3,3]
+; AVX512F-SLOW-NEXT:    vptestmd %ymm0, %ymm0, %k1
+; AVX512F-SLOW-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k2} {z}
+; AVX512F-SLOW-NEXT:    vmovdqa32 192(%rsi), %zmm1 {%k1} {z}
+; AVX512F-SLOW-NEXT:    vmovdqa32 128(%rsi), %zmm2 {%k4} {z}
+; AVX512F-SLOW-NEXT:    vmovdqa32 64(%rsi), %zmm3 {%k3} {z}
+; AVX512F-SLOW-NEXT:    vmovdqa64 %zmm3, 64(%rdx)
+; AVX512F-SLOW-NEXT:    vmovdqa64 %zmm2, 128(%rdx)
+; AVX512F-SLOW-NEXT:    vmovdqa64 %zmm0, (%rdx)
+; AVX512F-SLOW-NEXT:    vmovdqa %ymm1, 192(%rdx)
+; AVX512F-SLOW-NEXT:    vzeroupper
+; AVX512F-SLOW-NEXT:    retq
 ;
-; AVX512DQ-LABEL: mask_replication_factor7_vf8:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    kmovb (%rdi), %k2
-; AVX512DQ-NEXT:    kshiftrb $7, %k2, %k0
-; AVX512DQ-NEXT:    kmovw %k0, %eax
-; AVX512DQ-NEXT:    kshiftrb $6, %k2, %k0
-; AVX512DQ-NEXT:    kshiftrb $5, %k2, %k1
-; AVX512DQ-NEXT:    kshiftrb $4, %k2, %k3
-; AVX512DQ-NEXT:    kmovw %k3, %ecx
-; AVX512DQ-NEXT:    kshiftrb $3, %k2, %k3
-; AVX512DQ-NEXT:    kmovw %k3, %edx
-; AVX512DQ-NEXT:    kshiftrb $2, %k2, %k3
-; AVX512DQ-NEXT:    kmovw %k3, %r8d
-; AVX512DQ-NEXT:    kshiftrb $1, %k2, %k3
-; AVX512DQ-NEXT:    kmovw %k3, %r9d
-; AVX512DQ-NEXT:    kmovw %k2, %edi
-; AVX512DQ-NEXT:    movzbl %dil, %r10d
-; AVX512DQ-NEXT:    andl $1, %r10d
-; AVX512DQ-NEXT:    leaq (%r10,%r10,2), %rdi
-; AVX512DQ-NEXT:    leaq (%rdi,%r10,4), %rdi
-; AVX512DQ-NEXT:    leaq (%rdi,%r10,8), %rdi
-; AVX512DQ-NEXT:    movq %r10, %r11
-; AVX512DQ-NEXT:    shlq $4, %r11
-; AVX512DQ-NEXT:    orq %rdi, %r11
-; AVX512DQ-NEXT:    movq %r10, %rdi
-; AVX512DQ-NEXT:    shlq $5, %rdi
-; AVX512DQ-NEXT:    orq %r11, %rdi
-; AVX512DQ-NEXT:    shlq $6, %r10
-; AVX512DQ-NEXT:    movzbl %r9b, %r9d
-; AVX512DQ-NEXT:    andl $1, %r9d
-; AVX512DQ-NEXT:    movq %r9, %r11
-; AVX512DQ-NEXT:    shlq $7, %r11
-; AVX512DQ-NEXT:    orq %r10, %r11
-; AVX512DQ-NEXT:    movq %r9, %r10
-; AVX512DQ-NEXT:    shlq $8, %r10
-; AVX512DQ-NEXT:    orq %r11, %r10
-; AVX512DQ-NEXT:    movq %r9, %r11
-; AVX512DQ-NEXT:    shlq $9, %r11
-; AVX512DQ-NEXT:    orq %r10, %r11
-; AVX512DQ-NEXT:    movq %r9, %r10
-; AVX512DQ-NEXT:    shlq $10, %r10
-; AVX512DQ-NEXT:    orq %r11, %r10
-; AVX512DQ-NEXT:    movq %r9, %r11
-; AVX512DQ-NEXT:    shlq $11, %r11
-; AVX512DQ-NEXT:    orq %r10, %r11
-; AVX512DQ-NEXT:    movq %r9, %r10
-; AVX512DQ-NEXT:    shlq $12, %r10
-; AVX512DQ-NEXT:    orq %r11, %r10
-; AVX512DQ-NEXT:    shlq $13, %r9
-; AVX512DQ-NEXT:    orq %r10, %r9
-; AVX512DQ-NEXT:    movzbl %r8b, %r8d
-; AVX512DQ-NEXT:    andl $1, %r8d
-; AVX512DQ-NEXT:    movq %r8, %r10
-; AVX512DQ-NEXT:    shlq $14, %r10
-; AVX512DQ-NEXT:    orq %r9, %r10
-; AVX512DQ-NEXT:    movq %r8, %r9
-; AVX512DQ-NEXT:    shlq $15, %r9
-; AVX512DQ-NEXT:    orq %r10, %r9
-; AVX512DQ-NEXT:    movq %r8, %r10
-; AVX512DQ-NEXT:    shlq $16, %r10
-; AVX512DQ-NEXT:    orq %r9, %r10
-; AVX512DQ-NEXT:    movq %r8, %r9
-; AVX512DQ-NEXT:    shlq $17, %r9
-; AVX512DQ-NEXT:    orq %r10, %r9
-; AVX512DQ-NEXT:    movq %r8, %r10
-; AVX512DQ-NEXT:    shlq $18, %r10
-; AVX512DQ-NEXT:    orq %r9, %r10
-; AVX512DQ-NEXT:    movq %r8, %r9
-; AVX512DQ-NEXT:    shlq $19, %r9
-; AVX512DQ-NEXT:    orq %r10, %r9
-; AVX512DQ-NEXT:    shlq $20, %r8
-; AVX512DQ-NEXT:    orq %r9, %r8
-; AVX512DQ-NEXT:    movzbl %dl, %r9d
-; AVX512DQ-NEXT:    andl $1, %r9d
-; AVX512DQ-NEXT:    movq %r9, %rdx
-; AVX512DQ-NEXT:    shlq $21, %rdx
-; AVX512DQ-NEXT:    orq %r8, %rdx
-; AVX512DQ-NEXT:    movq %r9, %r8
-; AVX512DQ-NEXT:    shlq $22, %r8
-; AVX512DQ-NEXT:    orq %rdx, %r8
-; AVX512DQ-NEXT:    movq %r9, %rdx
-; AVX512DQ-NEXT:    shlq $23, %rdx
-; AVX512DQ-NEXT:    orq %r8, %rdx
-; AVX512DQ-NEXT:    movq %r9, %r8
-; AVX512DQ-NEXT:    shlq $24, %r8
-; AVX512DQ-NEXT:    orq %rdx, %r8
-; AVX512DQ-NEXT:    movq %r9, %rdx
-; AVX512DQ-NEXT:    shlq $25, %rdx
-; AVX512DQ-NEXT:    orq %r8, %rdx
-; AVX512DQ-NEXT:    movq %r9, %r8
-; AVX512DQ-NEXT:    shlq $26, %r8
-; AVX512DQ-NEXT:    orq %rdx, %r8
-; AVX512DQ-NEXT:    shlq $27, %r9
-; AVX512DQ-NEXT:    orq %r8, %r9
-; AVX512DQ-NEXT:    movzbl %cl, %edx
-; AVX512DQ-NEXT:    andl $1, %edx
-; AVX512DQ-NEXT:    movq %rdx, %r8
-; AVX512DQ-NEXT:    shlq $28, %r8
-; AVX512DQ-NEXT:    orq %r9, %r8
-; AVX512DQ-NEXT:    movq %rdx, %r9
-; AVX512DQ-NEXT:    shlq $29, %r9
-; AVX512DQ-NEXT:    orq %r8, %r9
-; AVX512DQ-NEXT:    movq %rdx, %r8
-; AVX512DQ-NEXT:    shlq $30, %r8
-; AVX512DQ-NEXT:    orq %r9, %r8
-; AVX512DQ-NEXT:    shlq $31, %rdx
-; AVX512DQ-NEXT:    orq %r8, %rdx
-; AVX512DQ-NEXT:    orq %rdi, %rdx
-; AVX512DQ-NEXT:    movl %edx, (%rsi)
-; AVX512DQ-NEXT:    andl $1, %ecx
-; AVX512DQ-NEXT:    movq %rcx, %rdi
-; AVX512DQ-NEXT:    shlq $32, %rdi
-; AVX512DQ-NEXT:    movq %rcx, %r8
-; AVX512DQ-NEXT:    shlq $33, %r8
-; AVX512DQ-NEXT:    orq %rdi, %r8
-; AVX512DQ-NEXT:    shlq $34, %rcx
-; AVX512DQ-NEXT:    orq %r8, %rcx
-; AVX512DQ-NEXT:    kmovw %k1, %edi
-; AVX512DQ-NEXT:    andl $1, %edi
-; AVX512DQ-NEXT:    movq %rdi, %r8
-; AVX512DQ-NEXT:    shlq $35, %r8
-; AVX512DQ-NEXT:    orq %rcx, %r8
-; AVX512DQ-NEXT:    movq %rdi, %rcx
-; AVX512DQ-NEXT:    shlq $36, %rcx
-; AVX512DQ-NEXT:    orq %r8, %rcx
-; AVX512DQ-NEXT:    movq %rdi, %r8
-; AVX512DQ-NEXT:    shlq $37, %r8
-; AVX512DQ-NEXT:    orq %rcx, %r8
-; AVX512DQ-NEXT:    movq %rdi, %rcx
-; AVX512DQ-NEXT:    shlq $38, %rcx
-; AVX512DQ-NEXT:    orq %r8, %rcx
-; AVX512DQ-NEXT:    movq %rdi, %r8
-; AVX512DQ-NEXT:    shlq $39, %r8
-; AVX512DQ-NEXT:    orq %rcx, %r8
-; AVX512DQ-NEXT:    movq %rdi, %rcx
-; AVX512DQ-NEXT:    shlq $40, %rcx
-; AVX512DQ-NEXT:    orq %r8, %rcx
-; AVX512DQ-NEXT:    shlq $41, %rdi
-; AVX512DQ-NEXT:    orq %rcx, %rdi
-; AVX512DQ-NEXT:    kmovw %k0, %ecx
-; AVX512DQ-NEXT:    andl $1, %ecx
-; AVX512DQ-NEXT:    movq %rcx, %r8
-; AVX512DQ-NEXT:    shlq $42, %r8
-; AVX512DQ-NEXT:    orq %rdi, %r8
-; AVX512DQ-NEXT:    movq %rcx, %rdi
-; AVX512DQ-NEXT:    shlq $43, %rdi
-; AVX512DQ-NEXT:    orq %r8, %rdi
-; AVX512DQ-NEXT:    movq %rcx, %r8
-; AVX512DQ-NEXT:    shlq $44, %r8
-; AVX512DQ-NEXT:    orq %rdi, %r8
-; AVX512DQ-NEXT:    movq %rcx, %rdi
-; AVX512DQ-NEXT:    shlq $45, %rdi
-; AVX512DQ-NEXT:    orq %r8, %rdi
-; AVX512DQ-NEXT:    movq %rcx, %r8
-; AVX512DQ-NEXT:    shlq $46, %r8
-; AVX512DQ-NEXT:    orq %rdi, %r8
-; AVX512DQ-NEXT:    movq %rcx, %rdi
-; AVX512DQ-NEXT:    shlq $47, %rdi
-; AVX512DQ-NEXT:    orq %r8, %rdi
-; AVX512DQ-NEXT:    shlq $48, %rcx
-; AVX512DQ-NEXT:    orq %rdi, %rcx
-; AVX512DQ-NEXT:    movzbl %al, %edi
-; AVX512DQ-NEXT:    # kill: def $eax killed $eax def $rax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %r8
-; AVX512DQ-NEXT:    shlq $49, %r8
-; AVX512DQ-NEXT:    orq %rcx, %r8
-; AVX512DQ-NEXT:    movq %rax, %rcx
-; AVX512DQ-NEXT:    shlq $50, %rcx
-; AVX512DQ-NEXT:    orq %r8, %rcx
-; AVX512DQ-NEXT:    movq %rax, %r8
-; AVX512DQ-NEXT:    shlq $51, %r8
-; AVX512DQ-NEXT:    orq %rcx, %r8
-; AVX512DQ-NEXT:    movq %rax, %rcx
-; AVX512DQ-NEXT:    shlq $52, %rcx
-; AVX512DQ-NEXT:    orq %r8, %rcx
-; AVX512DQ-NEXT:    movq %rax, %r8
-; AVX512DQ-NEXT:    shlq $53, %r8
-; AVX512DQ-NEXT:    orq %rcx, %r8
-; AVX512DQ-NEXT:    shlq $54, %rax
-; AVX512DQ-NEXT:    orq %r8, %rax
-; AVX512DQ-NEXT:    shlq $55, %rdi
-; AVX512DQ-NEXT:    orq %rax, %rdi
-; AVX512DQ-NEXT:    orq %rdx, %rdi
-; AVX512DQ-NEXT:    movq %rdi, %rax
-; AVX512DQ-NEXT:    shrq $48, %rax
-; AVX512DQ-NEXT:    movb %al, 6(%rsi)
-; AVX512DQ-NEXT:    shrq $32, %rdi
-; AVX512DQ-NEXT:    movw %di, 4(%rsi)
-; AVX512DQ-NEXT:    retq
+; AVX512F-FAST-LABEL: mask_replication_factor7_vf8:
+; AVX512F-FAST:       # %bb.0:
+; AVX512F-FAST-NEXT:    kmovw (%rdi), %k1
+; AVX512F-FAST-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
+; AVX512F-FAST-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512F-FAST-NEXT:    vptestmd %zmm1, %zmm1, %k2
+; AVX512F-FAST-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
+; AVX512F-FAST-NEXT:    movw $1, %ax
+; AVX512F-FAST-NEXT:    kmovw %eax, %k2
+; AVX512F-FAST-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k2}
+; AVX512F-FAST-NEXT:    vptestmd %zmm1, %zmm1, %k2
+; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4]
+; AVX512F-FAST-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512F-FAST-NEXT:    vptestmd %zmm1, %zmm1, %k3
+; AVX512F-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6]
+; AVX512F-FAST-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512F-FAST-NEXT:    vptestmd %zmm0, %zmm0, %k4
+; AVX512F-FAST-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512F-FAST-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,7,7,7,7,7,7,7]
+; AVX512F-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX512F-FAST-NEXT:    vptestmd %ymm0, %ymm0, %k1
+; AVX512F-FAST-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k2} {z}
+; AVX512F-FAST-NEXT:    vmovdqa32 192(%rsi), %zmm1 {%k1} {z}
+; AVX512F-FAST-NEXT:    vmovdqa32 128(%rsi), %zmm2 {%k4} {z}
+; AVX512F-FAST-NEXT:    vmovdqa32 64(%rsi), %zmm3 {%k3} {z}
+; AVX512F-FAST-NEXT:    vmovdqa64 %zmm3, 64(%rdx)
+; AVX512F-FAST-NEXT:    vmovdqa64 %zmm2, 128(%rdx)
+; AVX512F-FAST-NEXT:    vmovdqa %ymm1, 192(%rdx)
+; AVX512F-FAST-NEXT:    vmovdqa64 %zmm0, (%rdx)
+; AVX512F-FAST-NEXT:    vzeroupper
+; AVX512F-FAST-NEXT:    retq
 ;
-; AVX512BW-LABEL: mask_replication_factor7_vf8:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    kmovw (%rdi), %k2
-; AVX512BW-NEXT:    kshiftrw $7, %k2, %k0
-; AVX512BW-NEXT:    kmovd %k0, %eax
-; AVX512BW-NEXT:    kshiftrw $6, %k2, %k0
-; AVX512BW-NEXT:    kshiftrw $5, %k2, %k1
-; AVX512BW-NEXT:    kshiftrw $4, %k2, %k3
-; AVX512BW-NEXT:    kmovd %k3, %ecx
-; AVX512BW-NEXT:    kshiftrw $3, %k2, %k3
-; AVX512BW-NEXT:    kmovd %k3, %edx
-; AVX512BW-NEXT:    kshiftrw $2, %k2, %k3
-; AVX512BW-NEXT:    kmovd %k3, %r8d
-; AVX512BW-NEXT:    kshiftrw $1, %k2, %k3
-; AVX512BW-NEXT:    kmovd %k3, %r9d
-; AVX512BW-NEXT:    kmovd %k2, %edi
-; AVX512BW-NEXT:    movzbl %dil, %r10d
-; AVX512BW-NEXT:    andl $1, %r10d
-; AVX512BW-NEXT:    leaq (%r10,%r10,2), %rdi
-; AVX512BW-NEXT:    leaq (%rdi,%r10,4), %rdi
-; AVX512BW-NEXT:    leaq (%rdi,%r10,8), %rdi
-; AVX512BW-NEXT:    movq %r10, %r11
-; AVX512BW-NEXT:    shlq $4, %r11
-; AVX512BW-NEXT:    orq %rdi, %r11
-; AVX512BW-NEXT:    movq %r10, %rdi
-; AVX512BW-NEXT:    shlq $5, %rdi
-; AVX512BW-NEXT:    orq %r11, %rdi
-; AVX512BW-NEXT:    shlq $6, %r10
-; AVX512BW-NEXT:    movzbl %r9b, %r9d
-; AVX512BW-NEXT:    andl $1, %r9d
-; AVX512BW-NEXT:    movq %r9, %r11
-; AVX512BW-NEXT:    shlq $7, %r11
-; AVX512BW-NEXT:    orq %r10, %r11
-; AVX512BW-NEXT:    movq %r9, %r10
-; AVX512BW-NEXT:    shlq $8, %r10
-; AVX512BW-NEXT:    orq %r11, %r10
-; AVX512BW-NEXT:    movq %r9, %r11
-; AVX512BW-NEXT:    shlq $9, %r11
-; AVX512BW-NEXT:    orq %r10, %r11
-; AVX512BW-NEXT:    movq %r9, %r10
-; AVX512BW-NEXT:    shlq $10, %r10
-; AVX512BW-NEXT:    orq %r11, %r10
-; AVX512BW-NEXT:    movq %r9, %r11
-; AVX512BW-NEXT:    shlq $11, %r11
-; AVX512BW-NEXT:    orq %r10, %r11
-; AVX512BW-NEXT:    movq %r9, %r10
-; AVX512BW-NEXT:    shlq $12, %r10
-; AVX512BW-NEXT:    orq %r11, %r10
-; AVX512BW-NEXT:    shlq $13, %r9
-; AVX512BW-NEXT:    orq %r10, %r9
-; AVX512BW-NEXT:    movzbl %r8b, %r8d
-; AVX512BW-NEXT:    andl $1, %r8d
-; AVX512BW-NEXT:    movq %r8, %r10
-; AVX512BW-NEXT:    shlq $14, %r10
-; AVX512BW-NEXT:    orq %r9, %r10
-; AVX512BW-NEXT:    movq %r8, %r9
-; AVX512BW-NEXT:    shlq $15, %r9
-; AVX512BW-NEXT:    orq %r10, %r9
-; AVX512BW-NEXT:    movq %r8, %r10
-; AVX512BW-NEXT:    shlq $16, %r10
-; AVX512BW-NEXT:    orq %r9, %r10
-; AVX512BW-NEXT:    movq %r8, %r9
-; AVX512BW-NEXT:    shlq $17, %r9
-; AVX512BW-NEXT:    orq %r10, %r9
-; AVX512BW-NEXT:    movq %r8, %r10
-; AVX512BW-NEXT:    shlq $18, %r10
-; AVX512BW-NEXT:    orq %r9, %r10
-; AVX512BW-NEXT:    movq %r8, %r9
-; AVX512BW-NEXT:    shlq $19, %r9
-; AVX512BW-NEXT:    orq %r10, %r9
-; AVX512BW-NEXT:    shlq $20, %r8
-; AVX512BW-NEXT:    orq %r9, %r8
-; AVX512BW-NEXT:    movzbl %dl, %r9d
-; AVX512BW-NEXT:    andl $1, %r9d
-; AVX512BW-NEXT:    movq %r9, %rdx
-; AVX512BW-NEXT:    shlq $21, %rdx
-; AVX512BW-NEXT:    orq %r8, %rdx
-; AVX512BW-NEXT:    movq %r9, %r8
-; AVX512BW-NEXT:    shlq $22, %r8
-; AVX512BW-NEXT:    orq %rdx, %r8
-; AVX512BW-NEXT:    movq %r9, %rdx
-; AVX512BW-NEXT:    shlq $23, %rdx
-; AVX512BW-NEXT:    orq %r8, %rdx
-; AVX512BW-NEXT:    movq %r9, %r8
-; AVX512BW-NEXT:    shlq $24, %r8
-; AVX512BW-NEXT:    orq %rdx, %r8
-; AVX512BW-NEXT:    movq %r9, %rdx
-; AVX512BW-NEXT:    shlq $25, %rdx
-; AVX512BW-NEXT:    orq %r8, %rdx
-; AVX512BW-NEXT:    movq %r9, %r8
-; AVX512BW-NEXT:    shlq $26, %r8
-; AVX512BW-NEXT:    orq %rdx, %r8
-; AVX512BW-NEXT:    shlq $27, %r9
-; AVX512BW-NEXT:    orq %r8, %r9
-; AVX512BW-NEXT:    movzbl %cl, %edx
-; AVX512BW-NEXT:    andl $1, %edx
-; AVX512BW-NEXT:    movq %rdx, %r8
-; AVX512BW-NEXT:    shlq $28, %r8
-; AVX512BW-NEXT:    orq %r9, %r8
-; AVX512BW-NEXT:    movq %rdx, %r9
-; AVX512BW-NEXT:    shlq $29, %r9
-; AVX512BW-NEXT:    orq %r8, %r9
-; AVX512BW-NEXT:    movq %rdx, %r8
-; AVX512BW-NEXT:    shlq $30, %r8
-; AVX512BW-NEXT:    orq %r9, %r8
-; AVX512BW-NEXT:    shlq $31, %rdx
-; AVX512BW-NEXT:    orq %r8, %rdx
-; AVX512BW-NEXT:    orq %rdi, %rdx
-; AVX512BW-NEXT:    movl %edx, (%rsi)
-; AVX512BW-NEXT:    andl $1, %ecx
-; AVX512BW-NEXT:    movq %rcx, %rdi
-; AVX512BW-NEXT:    shlq $32, %rdi
-; AVX512BW-NEXT:    movq %rcx, %r8
-; AVX512BW-NEXT:    shlq $33, %r8
-; AVX512BW-NEXT:    orq %rdi, %r8
-; AVX512BW-NEXT:    shlq $34, %rcx
-; AVX512BW-NEXT:    orq %r8, %rcx
-; AVX512BW-NEXT:    kmovd %k1, %edi
-; AVX512BW-NEXT:    andl $1, %edi
-; AVX512BW-NEXT:    movq %rdi, %r8
-; AVX512BW-NEXT:    shlq $35, %r8
-; AVX512BW-NEXT:    orq %rcx, %r8
-; AVX512BW-NEXT:    movq %rdi, %rcx
-; AVX512BW-NEXT:    shlq $36, %rcx
-; AVX512BW-NEXT:    orq %r8, %rcx
-; AVX512BW-NEXT:    movq %rdi, %r8
-; AVX512BW-NEXT:    shlq $37, %r8
-; AVX512BW-NEXT:    orq %rcx, %r8
-; AVX512BW-NEXT:    movq %rdi, %rcx
-; AVX512BW-NEXT:    shlq $38, %rcx
-; AVX512BW-NEXT:    orq %r8, %rcx
-; AVX512BW-NEXT:    movq %rdi, %r8
-; AVX512BW-NEXT:    shlq $39, %r8
-; AVX512BW-NEXT:    orq %rcx, %r8
-; AVX512BW-NEXT:    movq %rdi, %rcx
-; AVX512BW-NEXT:    shlq $40, %rcx
-; AVX512BW-NEXT:    orq %r8, %rcx
-; AVX512BW-NEXT:    shlq $41, %rdi
-; AVX512BW-NEXT:    orq %rcx, %rdi
-; AVX512BW-NEXT:    kmovd %k0, %ecx
-; AVX512BW-NEXT:    andl $1, %ecx
-; AVX512BW-NEXT:    movq %rcx, %r8
-; AVX512BW-NEXT:    shlq $42, %r8
-; AVX512BW-NEXT:    orq %rdi, %r8
-; AVX512BW-NEXT:    movq %rcx, %rdi
-; AVX512BW-NEXT:    shlq $43, %rdi
-; AVX512BW-NEXT:    orq %r8, %rdi
-; AVX512BW-NEXT:    movq %rcx, %r8
-; AVX512BW-NEXT:    shlq $44, %r8
-; AVX512BW-NEXT:    orq %rdi, %r8
-; AVX512BW-NEXT:    movq %rcx, %rdi
-; AVX512BW-NEXT:    shlq $45, %rdi
-; AVX512BW-NEXT:    orq %r8, %rdi
-; AVX512BW-NEXT:    movq %rcx, %r8
-; AVX512BW-NEXT:    shlq $46, %r8
-; AVX512BW-NEXT:    orq %rdi, %r8
-; AVX512BW-NEXT:    movq %rcx, %rdi
-; AVX512BW-NEXT:    shlq $47, %rdi
-; AVX512BW-NEXT:    orq %r8, %rdi
-; AVX512BW-NEXT:    shlq $48, %rcx
-; AVX512BW-NEXT:    orq %rdi, %rcx
-; AVX512BW-NEXT:    movzbl %al, %edi
-; AVX512BW-NEXT:    # kill: def $eax killed $eax def $rax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %r8
-; AVX512BW-NEXT:    shlq $49, %r8
-; AVX512BW-NEXT:    orq %rcx, %r8
-; AVX512BW-NEXT:    movq %rax, %rcx
-; AVX512BW-NEXT:    shlq $50, %rcx
-; AVX512BW-NEXT:    orq %r8, %rcx
-; AVX512BW-NEXT:    movq %rax, %r8
-; AVX512BW-NEXT:    shlq $51, %r8
-; AVX512BW-NEXT:    orq %rcx, %r8
-; AVX512BW-NEXT:    movq %rax, %rcx
-; AVX512BW-NEXT:    shlq $52, %rcx
-; AVX512BW-NEXT:    orq %r8, %rcx
-; AVX512BW-NEXT:    movq %rax, %r8
-; AVX512BW-NEXT:    shlq $53, %r8
-; AVX512BW-NEXT:    orq %rcx, %r8
-; AVX512BW-NEXT:    shlq $54, %rax
-; AVX512BW-NEXT:    orq %r8, %rax
-; AVX512BW-NEXT:    shlq $55, %rdi
-; AVX512BW-NEXT:    orq %rax, %rdi
-; AVX512BW-NEXT:    orq %rdx, %rdi
-; AVX512BW-NEXT:    movq %rdi, %rax
-; AVX512BW-NEXT:    shrq $48, %rax
-; AVX512BW-NEXT:    movb %al, 6(%rsi)
-; AVX512BW-NEXT:    shrq $32, %rdi
-; AVX512BW-NEXT:    movw %di, 4(%rsi)
-; AVX512BW-NEXT:    retq
-  %src.vec = load <8 x i1>, ptr %in.vec, align 64
-  %tgt.mask = shufflevector <8 x i1> %src.vec, <8 x i1> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-  store <56 x i1> %tgt.mask, ptr %out.vec, align 64
+; AVX512DQ-SLOW-LABEL: mask_replication_factor7_vf8:
+; AVX512DQ-SLOW:       # %bb.0:
+; AVX512DQ-SLOW-NEXT:    kmovb (%rdi), %k0
+; AVX512DQ-SLOW-NEXT:    vpmovm2d %k0, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
+; AVX512DQ-SLOW-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpmovd2m %zmm1, %k1
+; AVX512DQ-SLOW-NEXT:    vpmovm2d %k1, %zmm1
+; AVX512DQ-SLOW-NEXT:    movw $1, %ax
+; AVX512DQ-SLOW-NEXT:    kmovw %eax, %k1
+; AVX512DQ-SLOW-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; AVX512DQ-SLOW-NEXT:    vpmovd2m %zmm1, %k1
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4]
+; AVX512DQ-SLOW-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpmovd2m %zmm1, %k2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6]
+; AVX512DQ-SLOW-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpmovd2m %zmm0, %k3
+; AVX512DQ-SLOW-NEXT:    vpmovm2d %k0, %ymm0
+; AVX512DQ-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,3,3,6,7,7,7]
+; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,3,3]
+; AVX512DQ-SLOW-NEXT:    vpmovd2m %ymm0, %k4
+; AVX512DQ-SLOW-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
+; AVX512DQ-SLOW-NEXT:    vmovdqa32 192(%rsi), %zmm1 {%k4} {z}
+; AVX512DQ-SLOW-NEXT:    vmovdqa32 128(%rsi), %zmm2 {%k3} {z}
+; AVX512DQ-SLOW-NEXT:    vmovdqa32 64(%rsi), %zmm3 {%k2} {z}
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, 64(%rdx)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, 128(%rdx)
+; AVX512DQ-SLOW-NEXT:    vmovdqa %ymm1, 192(%rdx)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, (%rdx)
+; AVX512DQ-SLOW-NEXT:    vzeroupper
+; AVX512DQ-SLOW-NEXT:    retq
+;
+; AVX512DQ-FAST-LABEL: mask_replication_factor7_vf8:
+; AVX512DQ-FAST:       # %bb.0:
+; AVX512DQ-FAST-NEXT:    kmovb (%rdi), %k0
+; AVX512DQ-FAST-NEXT:    vpmovm2d %k0, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
+; AVX512DQ-FAST-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-FAST-NEXT:    vpmovd2m %zmm1, %k1
+; AVX512DQ-FAST-NEXT:    vpmovm2d %k1, %zmm1
+; AVX512DQ-FAST-NEXT:    movw $1, %ax
+; AVX512DQ-FAST-NEXT:    kmovw %eax, %k1
+; AVX512DQ-FAST-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; AVX512DQ-FAST-NEXT:    vpmovd2m %zmm1, %k1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4]
+; AVX512DQ-FAST-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-FAST-NEXT:    vpmovd2m %zmm1, %k2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6]
+; AVX512DQ-FAST-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT:    vpmovd2m %zmm0, %k3
+; AVX512DQ-FAST-NEXT:    vpmovm2d %k0, %ymm0
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,7,7,7,7,7,7,7]
+; AVX512DQ-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX512DQ-FAST-NEXT:    vpmovd2m %ymm0, %k4
+; AVX512DQ-FAST-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
+; AVX512DQ-FAST-NEXT:    vmovdqa32 192(%rsi), %zmm1 {%k4} {z}
+; AVX512DQ-FAST-NEXT:    vmovdqa32 128(%rsi), %zmm2 {%k3} {z}
+; AVX512DQ-FAST-NEXT:    vmovdqa32 64(%rsi), %zmm3 {%k2} {z}
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, 64(%rdx)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, 128(%rdx)
+; AVX512DQ-FAST-NEXT:    vmovdqa %ymm1, 192(%rdx)
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, (%rdx)
+; AVX512DQ-FAST-NEXT:    vzeroupper
+; AVX512DQ-FAST-NEXT:    retq
+;
+; AVX512BW-ONLY-LABEL: mask_replication_factor7_vf8:
+; AVX512BW-ONLY:       # %bb.0:
+; AVX512BW-ONLY-NEXT:    kmovw (%rdi), %k0
+; AVX512BW-ONLY-NEXT:    vpmovm2b %k0, %zmm0
+; AVX512BW-ONLY-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
+; AVX512BW-ONLY-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2,18,18,18,18,18,19,19,19,19,19,19,19,20,20,20,20,36,36,36,37,37,37,37,37,37,37,38,38,38,38,38,38,54,55,55,55,55,55,55,55,u,u,u,u,u,u,u,u]
+; AVX512BW-ONLY-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-ONLY-NEXT:    movabsq $72057594037927935, %rax # imm = 0xFFFFFFFFFFFFFF
+; AVX512BW-ONLY-NEXT:    kmovq %rax, %k1
+; AVX512BW-ONLY-NEXT:    vpcmpgtb %zmm0, %zmm1, %k1 {%k1}
+; AVX512BW-ONLY-NEXT:    kshiftrq $32, %k1, %k2
+; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k2, %k3
+; AVX512BW-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm0 {%k3} {z}
+; AVX512BW-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k1} {z}
+; AVX512BW-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm2 {%k2} {z}
+; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k1, %k1
+; AVX512BW-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm3 {%k1} {z}
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm3, 64(%rdx)
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm2, 128(%rdx)
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm1, (%rdx)
+; AVX512BW-ONLY-NEXT:    vmovdqa %ymm0, 192(%rdx)
+; AVX512BW-ONLY-NEXT:    vzeroupper
+; AVX512BW-ONLY-NEXT:    retq
+;
+; AVX512VBMI-ONLY-LABEL: mask_replication_factor7_vf8:
+; AVX512VBMI-ONLY:       # %bb.0:
+; AVX512VBMI-ONLY-NEXT:    kmovw (%rdi), %k0
+; AVX512VBMI-ONLY-NEXT:    vpmovm2b %k0, %zmm0
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2,2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4,4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6,6,7,7,7,7,7,7,7,u,u,u,u,u,u,u,u>
+; AVX512VBMI-ONLY-NEXT:    vpermb %zmm0, %zmm1, %zmm0
+; AVX512VBMI-ONLY-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VBMI-ONLY-NEXT:    movabsq $72057594037927935, %rax # imm = 0xFFFFFFFFFFFFFF
+; AVX512VBMI-ONLY-NEXT:    kmovq %rax, %k1
+; AVX512VBMI-ONLY-NEXT:    vpcmpgtb %zmm0, %zmm1, %k1 {%k1}
+; AVX512VBMI-ONLY-NEXT:    kshiftrq $32, %k1, %k2
+; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k2, %k3
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm0 {%k3} {z}
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k1} {z}
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm2 {%k2} {z}
+; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k1, %k1
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm3 {%k1} {z}
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm3, 64(%rdx)
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm2, 128(%rdx)
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm1, (%rdx)
+; AVX512VBMI-ONLY-NEXT:    vmovdqa %ymm0, 192(%rdx)
+; AVX512VBMI-ONLY-NEXT:    vzeroupper
+; AVX512VBMI-ONLY-NEXT:    retq
+  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
+  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %tgt.mask = shufflevector <8 x i1> %src.mask, <8 x i1> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+  %data = call <56 x i32> @llvm.masked.load.v56i32.p0(ptr %in.vec, i32 64, <56 x i1> %tgt.mask, <56 x i32> poison)
+  %data.padded = shufflevector <56 x i32> %data, <56 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  store <56 x i32> %data, ptr %out.vec, align 64
   ret void
 }
 
-define void @mask_replication_factor7_vf16(ptr %in.vec, ptr %out.vec) nounwind {
+define void @mask_replication_factor7_vf16(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512F-ONLY-LABEL: mask_replication_factor7_vf16:
 ; AVX512F-ONLY:       # %bb.0:
-; AVX512F-ONLY-NEXT:    pushq %rbp
-; AVX512F-ONLY-NEXT:    pushq %r15
-; AVX512F-ONLY-NEXT:    pushq %r14
-; AVX512F-ONLY-NEXT:    pushq %r12
-; AVX512F-ONLY-NEXT:    pushq %rbx
-; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k4
-; AVX512F-ONLY-NEXT:    kshiftrw $8, %k4, %k0
-; AVX512F-ONLY-NEXT:    kshiftrw $7, %k4, %k1
-; AVX512F-ONLY-NEXT:    kshiftrw $6, %k4, %k2
-; AVX512F-ONLY-NEXT:    kshiftrw $5, %k4, %k3
-; AVX512F-ONLY-NEXT:    kshiftrw $4, %k4, %k5
-; AVX512F-ONLY-NEXT:    kmovw %k5, %ecx
-; AVX512F-ONLY-NEXT:    kshiftrw $3, %k4, %k5
-; AVX512F-ONLY-NEXT:    kmovw %k5, %edx
-; AVX512F-ONLY-NEXT:    kshiftrw $2, %k4, %k5
-; AVX512F-ONLY-NEXT:    kmovw %k5, %edi
-; AVX512F-ONLY-NEXT:    kshiftrw $1, %k4, %k5
-; AVX512F-ONLY-NEXT:    kmovw %k5, %r9d
-; AVX512F-ONLY-NEXT:    kshiftrw $15, %k4, %k5
-; AVX512F-ONLY-NEXT:    kmovw %k5, %r8d
-; AVX512F-ONLY-NEXT:    kshiftrw $14, %k4, %k5
-; AVX512F-ONLY-NEXT:    kshiftrw $13, %k4, %k6
-; AVX512F-ONLY-NEXT:    kmovw %k6, %r10d
-; AVX512F-ONLY-NEXT:    kshiftrw $12, %k4, %k6
-; AVX512F-ONLY-NEXT:    kmovw %k6, %ebx
-; AVX512F-ONLY-NEXT:    kshiftrw $11, %k4, %k6
-; AVX512F-ONLY-NEXT:    kmovw %k6, %ebp
-; AVX512F-ONLY-NEXT:    kshiftrw $10, %k4, %k6
-; AVX512F-ONLY-NEXT:    kmovw %k6, %r14d
-; AVX512F-ONLY-NEXT:    kshiftrw $9, %k4, %k6
-; AVX512F-ONLY-NEXT:    kmovw %k6, %eax
-; AVX512F-ONLY-NEXT:    movzbl %al, %eax
-; AVX512F-ONLY-NEXT:    movl %eax, %r11d
-; AVX512F-ONLY-NEXT:    andl $1, %r11d
-; AVX512F-ONLY-NEXT:    leaq (%r11,%r11,2), %r15
-; AVX512F-ONLY-NEXT:    leaq (%r15,%r11,4), %r15
-; AVX512F-ONLY-NEXT:    leaq (%r15,%r11,8), %r15
-; AVX512F-ONLY-NEXT:    movq %r11, %r12
-; AVX512F-ONLY-NEXT:    shlq $4, %r12
-; AVX512F-ONLY-NEXT:    orq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $5, %r11
-; AVX512F-ONLY-NEXT:    orq %r12, %r11
-; AVX512F-ONLY-NEXT:    movzbl %r14b, %r15d
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    movq %r15, %r14
-; AVX512F-ONLY-NEXT:    shlq $6, %r14
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $7, %r12
-; AVX512F-ONLY-NEXT:    orq %r14, %r12
-; AVX512F-ONLY-NEXT:    movq %r15, %r14
-; AVX512F-ONLY-NEXT:    shlq $8, %r14
-; AVX512F-ONLY-NEXT:    orq %r12, %r14
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $9, %r12
-; AVX512F-ONLY-NEXT:    orq %r14, %r12
-; AVX512F-ONLY-NEXT:    movq %r15, %r14
-; AVX512F-ONLY-NEXT:    shlq $10, %r14
-; AVX512F-ONLY-NEXT:    orq %r12, %r14
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $11, %r12
-; AVX512F-ONLY-NEXT:    orq %r14, %r12
-; AVX512F-ONLY-NEXT:    shlq $12, %r15
-; AVX512F-ONLY-NEXT:    orq %r12, %r15
-; AVX512F-ONLY-NEXT:    movzbl %bpl, %r14d
-; AVX512F-ONLY-NEXT:    andl $1, %r14d
-; AVX512F-ONLY-NEXT:    movq %r14, %r12
-; AVX512F-ONLY-NEXT:    shlq $13, %r12
-; AVX512F-ONLY-NEXT:    orq %r15, %r12
-; AVX512F-ONLY-NEXT:    movq %r14, %r15
-; AVX512F-ONLY-NEXT:    shlq $14, %r15
-; AVX512F-ONLY-NEXT:    orq %r12, %r15
-; AVX512F-ONLY-NEXT:    movq %r14, %r12
-; AVX512F-ONLY-NEXT:    shlq $15, %r12
-; AVX512F-ONLY-NEXT:    orq %r15, %r12
-; AVX512F-ONLY-NEXT:    movq %r14, %r15
-; AVX512F-ONLY-NEXT:    shlq $16, %r15
-; AVX512F-ONLY-NEXT:    orq %r12, %r15
-; AVX512F-ONLY-NEXT:    movq %r14, %r12
-; AVX512F-ONLY-NEXT:    shlq $17, %r12
-; AVX512F-ONLY-NEXT:    orq %r15, %r12
-; AVX512F-ONLY-NEXT:    movq %r14, %r15
-; AVX512F-ONLY-NEXT:    shlq $18, %r15
-; AVX512F-ONLY-NEXT:    orq %r12, %r15
-; AVX512F-ONLY-NEXT:    shlq $19, %r14
-; AVX512F-ONLY-NEXT:    orq %r15, %r14
-; AVX512F-ONLY-NEXT:    movzbl %bl, %r15d
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    movq %r15, %rbx
-; AVX512F-ONLY-NEXT:    shlq $20, %rbx
-; AVX512F-ONLY-NEXT:    orq %r14, %rbx
-; AVX512F-ONLY-NEXT:    movq %r15, %r14
-; AVX512F-ONLY-NEXT:    shlq $21, %r14
-; AVX512F-ONLY-NEXT:    orq %rbx, %r14
-; AVX512F-ONLY-NEXT:    movq %r15, %rbx
-; AVX512F-ONLY-NEXT:    shlq $22, %rbx
-; AVX512F-ONLY-NEXT:    orq %r14, %rbx
-; AVX512F-ONLY-NEXT:    movq %r15, %r14
-; AVX512F-ONLY-NEXT:    shlq $23, %r14
-; AVX512F-ONLY-NEXT:    orq %rbx, %r14
-; AVX512F-ONLY-NEXT:    movq %r15, %rbx
-; AVX512F-ONLY-NEXT:    shlq $24, %rbx
-; AVX512F-ONLY-NEXT:    orq %r14, %rbx
-; AVX512F-ONLY-NEXT:    movq %r15, %r14
-; AVX512F-ONLY-NEXT:    shlq $25, %r14
-; AVX512F-ONLY-NEXT:    orq %rbx, %r14
-; AVX512F-ONLY-NEXT:    shlq $26, %r15
-; AVX512F-ONLY-NEXT:    orq %r14, %r15
-; AVX512F-ONLY-NEXT:    movzbl %r10b, %ebx
-; AVX512F-ONLY-NEXT:    andl $1, %ebx
-; AVX512F-ONLY-NEXT:    movq %rbx, %r14
-; AVX512F-ONLY-NEXT:    shlq $27, %r14
-; AVX512F-ONLY-NEXT:    orq %r15, %r14
-; AVX512F-ONLY-NEXT:    movq %rbx, %r15
-; AVX512F-ONLY-NEXT:    shlq $28, %r15
-; AVX512F-ONLY-NEXT:    orq %r14, %r15
-; AVX512F-ONLY-NEXT:    movq %rbx, %r14
-; AVX512F-ONLY-NEXT:    shlq $29, %r14
-; AVX512F-ONLY-NEXT:    orq %r15, %r14
-; AVX512F-ONLY-NEXT:    movq %rbx, %r15
-; AVX512F-ONLY-NEXT:    shlq $30, %r15
-; AVX512F-ONLY-NEXT:    orq %r14, %r15
-; AVX512F-ONLY-NEXT:    shlq $31, %rbx
-; AVX512F-ONLY-NEXT:    orq %r15, %rbx
-; AVX512F-ONLY-NEXT:    orq %r11, %rbx
-; AVX512F-ONLY-NEXT:    movl %ebx, 8(%rsi)
-; AVX512F-ONLY-NEXT:    andl $1, %r10d
-; AVX512F-ONLY-NEXT:    movq %r10, %r11
-; AVX512F-ONLY-NEXT:    shlq $32, %r11
-; AVX512F-ONLY-NEXT:    shlq $33, %r10
-; AVX512F-ONLY-NEXT:    orq %r11, %r10
-; AVX512F-ONLY-NEXT:    kmovw %k5, %r11d
-; AVX512F-ONLY-NEXT:    andl $1, %r11d
-; AVX512F-ONLY-NEXT:    movq %r11, %r14
-; AVX512F-ONLY-NEXT:    shlq $34, %r14
-; AVX512F-ONLY-NEXT:    orq %r10, %r14
-; AVX512F-ONLY-NEXT:    movq %r11, %r10
-; AVX512F-ONLY-NEXT:    shlq $35, %r10
-; AVX512F-ONLY-NEXT:    orq %r14, %r10
-; AVX512F-ONLY-NEXT:    movq %r11, %r14
-; AVX512F-ONLY-NEXT:    shlq $36, %r14
-; AVX512F-ONLY-NEXT:    orq %r10, %r14
-; AVX512F-ONLY-NEXT:    movq %r11, %r10
-; AVX512F-ONLY-NEXT:    shlq $37, %r10
-; AVX512F-ONLY-NEXT:    orq %r14, %r10
-; AVX512F-ONLY-NEXT:    movq %r11, %r14
-; AVX512F-ONLY-NEXT:    shlq $38, %r14
-; AVX512F-ONLY-NEXT:    orq %r10, %r14
-; AVX512F-ONLY-NEXT:    movq %r11, %r10
-; AVX512F-ONLY-NEXT:    shlq $39, %r10
-; AVX512F-ONLY-NEXT:    orq %r14, %r10
-; AVX512F-ONLY-NEXT:    shlq $40, %r11
-; AVX512F-ONLY-NEXT:    orq %r10, %r11
-; AVX512F-ONLY-NEXT:    movzbl %r8b, %r10d
-; AVX512F-ONLY-NEXT:    # kill: def $r8d killed $r8d def $r8
-; AVX512F-ONLY-NEXT:    andl $1, %r8d
-; AVX512F-ONLY-NEXT:    movq %r8, %r14
-; AVX512F-ONLY-NEXT:    shlq $41, %r14
-; AVX512F-ONLY-NEXT:    orq %r11, %r14
-; AVX512F-ONLY-NEXT:    movq %r8, %r11
-; AVX512F-ONLY-NEXT:    shlq $42, %r11
-; AVX512F-ONLY-NEXT:    orq %r14, %r11
-; AVX512F-ONLY-NEXT:    movq %r8, %r14
-; AVX512F-ONLY-NEXT:    shlq $43, %r14
-; AVX512F-ONLY-NEXT:    orq %r11, %r14
-; AVX512F-ONLY-NEXT:    movq %r8, %r11
-; AVX512F-ONLY-NEXT:    shlq $44, %r11
-; AVX512F-ONLY-NEXT:    orq %r14, %r11
-; AVX512F-ONLY-NEXT:    movq %r8, %r14
-; AVX512F-ONLY-NEXT:    shlq $45, %r14
-; AVX512F-ONLY-NEXT:    orq %r11, %r14
-; AVX512F-ONLY-NEXT:    shlq $46, %r8
-; AVX512F-ONLY-NEXT:    orq %r14, %r8
-; AVX512F-ONLY-NEXT:    shlq $47, %r10
-; AVX512F-ONLY-NEXT:    orq %r8, %r10
-; AVX512F-ONLY-NEXT:    orq %rbx, %r10
-; AVX512F-ONLY-NEXT:    shrq $32, %r10
-; AVX512F-ONLY-NEXT:    movw %r10w, 12(%rsi)
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r8d
-; AVX512F-ONLY-NEXT:    movzbl %r8b, %r10d
-; AVX512F-ONLY-NEXT:    andl $1, %r10d
-; AVX512F-ONLY-NEXT:    leaq (%r10,%r10,2), %r8
-; AVX512F-ONLY-NEXT:    leaq (%r8,%r10,4), %r8
-; AVX512F-ONLY-NEXT:    leaq (%r8,%r10,8), %r8
-; AVX512F-ONLY-NEXT:    movq %r10, %r11
-; AVX512F-ONLY-NEXT:    shlq $4, %r11
-; AVX512F-ONLY-NEXT:    orq %r8, %r11
-; AVX512F-ONLY-NEXT:    movq %r10, %r8
-; AVX512F-ONLY-NEXT:    shlq $5, %r8
-; AVX512F-ONLY-NEXT:    orq %r11, %r8
-; AVX512F-ONLY-NEXT:    shlq $6, %r10
-; AVX512F-ONLY-NEXT:    movzbl %r9b, %r9d
-; AVX512F-ONLY-NEXT:    andl $1, %r9d
-; AVX512F-ONLY-NEXT:    movq %r9, %r11
-; AVX512F-ONLY-NEXT:    shlq $7, %r11
-; AVX512F-ONLY-NEXT:    orq %r10, %r11
-; AVX512F-ONLY-NEXT:    movq %r9, %r10
-; AVX512F-ONLY-NEXT:    shlq $8, %r10
-; AVX512F-ONLY-NEXT:    orq %r11, %r10
-; AVX512F-ONLY-NEXT:    movq %r9, %r11
-; AVX512F-ONLY-NEXT:    shlq $9, %r11
-; AVX512F-ONLY-NEXT:    orq %r10, %r11
-; AVX512F-ONLY-NEXT:    movq %r9, %r10
-; AVX512F-ONLY-NEXT:    shlq $10, %r10
-; AVX512F-ONLY-NEXT:    orq %r11, %r10
-; AVX512F-ONLY-NEXT:    movq %r9, %r11
-; AVX512F-ONLY-NEXT:    shlq $11, %r11
-; AVX512F-ONLY-NEXT:    orq %r10, %r11
-; AVX512F-ONLY-NEXT:    movq %r9, %r10
-; AVX512F-ONLY-NEXT:    shlq $12, %r10
-; AVX512F-ONLY-NEXT:    orq %r11, %r10
-; AVX512F-ONLY-NEXT:    shlq $13, %r9
-; AVX512F-ONLY-NEXT:    orq %r10, %r9
-; AVX512F-ONLY-NEXT:    movzbl %dil, %edi
-; AVX512F-ONLY-NEXT:    andl $1, %edi
-; AVX512F-ONLY-NEXT:    movq %rdi, %r10
-; AVX512F-ONLY-NEXT:    shlq $14, %r10
-; AVX512F-ONLY-NEXT:    orq %r9, %r10
-; AVX512F-ONLY-NEXT:    movq %rdi, %r9
-; AVX512F-ONLY-NEXT:    shlq $15, %r9
-; AVX512F-ONLY-NEXT:    orq %r10, %r9
-; AVX512F-ONLY-NEXT:    movq %rdi, %r10
-; AVX512F-ONLY-NEXT:    shlq $16, %r10
-; AVX512F-ONLY-NEXT:    orq %r9, %r10
-; AVX512F-ONLY-NEXT:    movq %rdi, %r9
-; AVX512F-ONLY-NEXT:    shlq $17, %r9
-; AVX512F-ONLY-NEXT:    orq %r10, %r9
-; AVX512F-ONLY-NEXT:    movq %rdi, %r10
-; AVX512F-ONLY-NEXT:    shlq $18, %r10
-; AVX512F-ONLY-NEXT:    orq %r9, %r10
-; AVX512F-ONLY-NEXT:    movq %rdi, %r9
-; AVX512F-ONLY-NEXT:    shlq $19, %r9
-; AVX512F-ONLY-NEXT:    orq %r10, %r9
-; AVX512F-ONLY-NEXT:    shlq $20, %rdi
-; AVX512F-ONLY-NEXT:    orq %r9, %rdi
-; AVX512F-ONLY-NEXT:    movzbl %dl, %edx
-; AVX512F-ONLY-NEXT:    andl $1, %edx
-; AVX512F-ONLY-NEXT:    movq %rdx, %r9
-; AVX512F-ONLY-NEXT:    shlq $21, %r9
-; AVX512F-ONLY-NEXT:    orq %rdi, %r9
-; AVX512F-ONLY-NEXT:    movq %rdx, %rdi
-; AVX512F-ONLY-NEXT:    shlq $22, %rdi
-; AVX512F-ONLY-NEXT:    orq %r9, %rdi
-; AVX512F-ONLY-NEXT:    movq %rdx, %r9
-; AVX512F-ONLY-NEXT:    shlq $23, %r9
-; AVX512F-ONLY-NEXT:    orq %rdi, %r9
-; AVX512F-ONLY-NEXT:    movq %rdx, %rdi
-; AVX512F-ONLY-NEXT:    shlq $24, %rdi
-; AVX512F-ONLY-NEXT:    orq %r9, %rdi
-; AVX512F-ONLY-NEXT:    movq %rdx, %r9
-; AVX512F-ONLY-NEXT:    shlq $25, %r9
-; AVX512F-ONLY-NEXT:    orq %rdi, %r9
-; AVX512F-ONLY-NEXT:    movq %rdx, %rdi
-; AVX512F-ONLY-NEXT:    shlq $26, %rdi
-; AVX512F-ONLY-NEXT:    orq %r9, %rdi
-; AVX512F-ONLY-NEXT:    shlq $27, %rdx
-; AVX512F-ONLY-NEXT:    orq %rdi, %rdx
-; AVX512F-ONLY-NEXT:    movzbl %cl, %edi
-; AVX512F-ONLY-NEXT:    andl $1, %edi
-; AVX512F-ONLY-NEXT:    movq %rdi, %r9
-; AVX512F-ONLY-NEXT:    shlq $28, %r9
-; AVX512F-ONLY-NEXT:    orq %rdx, %r9
-; AVX512F-ONLY-NEXT:    movq %rdi, %rdx
-; AVX512F-ONLY-NEXT:    shlq $29, %rdx
-; AVX512F-ONLY-NEXT:    orq %r9, %rdx
-; AVX512F-ONLY-NEXT:    movq %rdi, %r9
-; AVX512F-ONLY-NEXT:    shlq $30, %r9
-; AVX512F-ONLY-NEXT:    orq %rdx, %r9
-; AVX512F-ONLY-NEXT:    shlq $31, %rdi
-; AVX512F-ONLY-NEXT:    orq %r9, %rdi
-; AVX512F-ONLY-NEXT:    andl $1, %ecx
-; AVX512F-ONLY-NEXT:    movq %rcx, %rdx
-; AVX512F-ONLY-NEXT:    shlq $32, %rdx
-; AVX512F-ONLY-NEXT:    orq %rdi, %rdx
-; AVX512F-ONLY-NEXT:    movq %rcx, %rdi
-; AVX512F-ONLY-NEXT:    shlq $33, %rdi
-; AVX512F-ONLY-NEXT:    orq %rdx, %rdi
-; AVX512F-ONLY-NEXT:    shlq $34, %rcx
-; AVX512F-ONLY-NEXT:    orq %rdi, %rcx
-; AVX512F-ONLY-NEXT:    kmovw %k3, %edx
-; AVX512F-ONLY-NEXT:    andl $1, %edx
-; AVX512F-ONLY-NEXT:    movq %rdx, %rdi
-; AVX512F-ONLY-NEXT:    shlq $35, %rdi
-; AVX512F-ONLY-NEXT:    orq %rcx, %rdi
-; AVX512F-ONLY-NEXT:    movq %rdx, %rcx
-; AVX512F-ONLY-NEXT:    shlq $36, %rcx
-; AVX512F-ONLY-NEXT:    orq %rdi, %rcx
-; AVX512F-ONLY-NEXT:    movq %rdx, %rdi
-; AVX512F-ONLY-NEXT:    shlq $37, %rdi
-; AVX512F-ONLY-NEXT:    orq %rcx, %rdi
-; AVX512F-ONLY-NEXT:    movq %rdx, %rcx
-; AVX512F-ONLY-NEXT:    shlq $38, %rcx
-; AVX512F-ONLY-NEXT:    orq %rdi, %rcx
-; AVX512F-ONLY-NEXT:    movq %rdx, %rdi
-; AVX512F-ONLY-NEXT:    shlq $39, %rdi
-; AVX512F-ONLY-NEXT:    orq %rcx, %rdi
-; AVX512F-ONLY-NEXT:    movq %rdx, %rcx
-; AVX512F-ONLY-NEXT:    shlq $40, %rcx
-; AVX512F-ONLY-NEXT:    orq %rdi, %rcx
-; AVX512F-ONLY-NEXT:    shlq $41, %rdx
-; AVX512F-ONLY-NEXT:    orq %rcx, %rdx
-; AVX512F-ONLY-NEXT:    kmovw %k2, %ecx
-; AVX512F-ONLY-NEXT:    andl $1, %ecx
-; AVX512F-ONLY-NEXT:    movq %rcx, %rdi
-; AVX512F-ONLY-NEXT:    shlq $42, %rdi
-; AVX512F-ONLY-NEXT:    orq %rdx, %rdi
-; AVX512F-ONLY-NEXT:    movq %rcx, %rdx
-; AVX512F-ONLY-NEXT:    shlq $43, %rdx
-; AVX512F-ONLY-NEXT:    orq %rdi, %rdx
-; AVX512F-ONLY-NEXT:    movq %rcx, %rdi
-; AVX512F-ONLY-NEXT:    shlq $44, %rdi
-; AVX512F-ONLY-NEXT:    orq %rdx, %rdi
-; AVX512F-ONLY-NEXT:    movq %rcx, %rdx
-; AVX512F-ONLY-NEXT:    shlq $45, %rdx
-; AVX512F-ONLY-NEXT:    orq %rdi, %rdx
-; AVX512F-ONLY-NEXT:    movq %rcx, %rdi
-; AVX512F-ONLY-NEXT:    shlq $46, %rdi
-; AVX512F-ONLY-NEXT:    orq %rdx, %rdi
-; AVX512F-ONLY-NEXT:    movq %rcx, %rdx
-; AVX512F-ONLY-NEXT:    shlq $47, %rdx
-; AVX512F-ONLY-NEXT:    orq %rdi, %rdx
-; AVX512F-ONLY-NEXT:    shlq $48, %rcx
-; AVX512F-ONLY-NEXT:    orq %rdx, %rcx
-; AVX512F-ONLY-NEXT:    kmovw %k1, %edx
-; AVX512F-ONLY-NEXT:    andl $1, %edx
-; AVX512F-ONLY-NEXT:    movq %rdx, %rdi
-; AVX512F-ONLY-NEXT:    shlq $49, %rdi
-; AVX512F-ONLY-NEXT:    orq %rcx, %rdi
-; AVX512F-ONLY-NEXT:    movq %rdx, %rcx
-; AVX512F-ONLY-NEXT:    shlq $50, %rcx
-; AVX512F-ONLY-NEXT:    orq %rdi, %rcx
-; AVX512F-ONLY-NEXT:    movq %rdx, %rdi
-; AVX512F-ONLY-NEXT:    shlq $51, %rdi
-; AVX512F-ONLY-NEXT:    orq %rcx, %rdi
-; AVX512F-ONLY-NEXT:    movq %rdx, %rcx
-; AVX512F-ONLY-NEXT:    shlq $52, %rcx
-; AVX512F-ONLY-NEXT:    orq %rdi, %rcx
-; AVX512F-ONLY-NEXT:    movq %rdx, %rdi
-; AVX512F-ONLY-NEXT:    shlq $53, %rdi
-; AVX512F-ONLY-NEXT:    orq %rcx, %rdi
-; AVX512F-ONLY-NEXT:    movq %rdx, %rcx
-; AVX512F-ONLY-NEXT:    shlq $54, %rcx
-; AVX512F-ONLY-NEXT:    orq %rdi, %rcx
-; AVX512F-ONLY-NEXT:    shlq $55, %rdx
-; AVX512F-ONLY-NEXT:    orq %rcx, %rdx
-; AVX512F-ONLY-NEXT:    kmovw %k0, %ecx
-; AVX512F-ONLY-NEXT:    andl $1, %ecx
-; AVX512F-ONLY-NEXT:    movq %rcx, %rdi
-; AVX512F-ONLY-NEXT:    shlq $56, %rdi
-; AVX512F-ONLY-NEXT:    orq %rdx, %rdi
-; AVX512F-ONLY-NEXT:    movq %rcx, %rdx
-; AVX512F-ONLY-NEXT:    shlq $57, %rdx
-; AVX512F-ONLY-NEXT:    orq %rdi, %rdx
-; AVX512F-ONLY-NEXT:    movq %rcx, %rdi
-; AVX512F-ONLY-NEXT:    shlq $58, %rdi
-; AVX512F-ONLY-NEXT:    orq %rdx, %rdi
-; AVX512F-ONLY-NEXT:    movq %rcx, %rdx
-; AVX512F-ONLY-NEXT:    shlq $59, %rdx
-; AVX512F-ONLY-NEXT:    orq %rdi, %rdx
-; AVX512F-ONLY-NEXT:    movq %rcx, %rdi
-; AVX512F-ONLY-NEXT:    shlq $60, %rdi
-; AVX512F-ONLY-NEXT:    orq %rdx, %rdi
-; AVX512F-ONLY-NEXT:    movq %rcx, %rdx
-; AVX512F-ONLY-NEXT:    shlq $61, %rdx
-; AVX512F-ONLY-NEXT:    orq %rdi, %rdx
-; AVX512F-ONLY-NEXT:    shlq $62, %rcx
-; AVX512F-ONLY-NEXT:    orq %rdx, %rcx
-; AVX512F-ONLY-NEXT:    shlq $63, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    orq %r8, %rax
-; AVX512F-ONLY-NEXT:    movq %rax, (%rsi)
-; AVX512F-ONLY-NEXT:    popq %rbx
-; AVX512F-ONLY-NEXT:    popq %r12
-; AVX512F-ONLY-NEXT:    popq %r14
-; AVX512F-ONLY-NEXT:    popq %r15
-; AVX512F-ONLY-NEXT:    popq %rbp
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; AVX512F-ONLY-NEXT:    movw $1, %ax
+; AVX512F-ONLY-NEXT:    kmovw %eax, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k2
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k3
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k4
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k5
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k6
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k7
+; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k2} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 384(%rsi), %zmm1 {%k7} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 320(%rsi), %zmm2 {%k6} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 256(%rsi), %zmm3 {%k5} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm4 {%k4} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm5 {%k3} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm6 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm6, 64(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm5, 128(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm4, 192(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm3, 256(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm2, 320(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm1, 384(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, (%rdx)
+; AVX512F-ONLY-NEXT:    vzeroupper
 ; AVX512F-ONLY-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: mask_replication_factor7_vf16:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    pushq %rbp
-; AVX512DQ-NEXT:    pushq %r15
-; AVX512DQ-NEXT:    pushq %r14
-; AVX512DQ-NEXT:    pushq %r12
-; AVX512DQ-NEXT:    pushq %rbx
-; AVX512DQ-NEXT:    kmovw (%rdi), %k4
-; AVX512DQ-NEXT:    kshiftrw $8, %k4, %k0
-; AVX512DQ-NEXT:    kshiftrw $7, %k4, %k1
-; AVX512DQ-NEXT:    kshiftrw $6, %k4, %k2
-; AVX512DQ-NEXT:    kshiftrw $5, %k4, %k3
-; AVX512DQ-NEXT:    kshiftrw $4, %k4, %k5
-; AVX512DQ-NEXT:    kmovw %k5, %ecx
-; AVX512DQ-NEXT:    kshiftrw $3, %k4, %k5
-; AVX512DQ-NEXT:    kmovw %k5, %edx
-; AVX512DQ-NEXT:    kshiftrw $2, %k4, %k5
-; AVX512DQ-NEXT:    kmovw %k5, %edi
-; AVX512DQ-NEXT:    kshiftrw $1, %k4, %k5
-; AVX512DQ-NEXT:    kmovw %k5, %r9d
-; AVX512DQ-NEXT:    kshiftrw $15, %k4, %k5
-; AVX512DQ-NEXT:    kmovw %k5, %r8d
-; AVX512DQ-NEXT:    kshiftrw $14, %k4, %k5
-; AVX512DQ-NEXT:    kshiftrw $13, %k4, %k6
-; AVX512DQ-NEXT:    kmovw %k6, %r10d
-; AVX512DQ-NEXT:    kshiftrw $12, %k4, %k6
-; AVX512DQ-NEXT:    kmovw %k6, %ebx
-; AVX512DQ-NEXT:    kshiftrw $11, %k4, %k6
-; AVX512DQ-NEXT:    kmovw %k6, %ebp
-; AVX512DQ-NEXT:    kshiftrw $10, %k4, %k6
-; AVX512DQ-NEXT:    kmovw %k6, %r14d
-; AVX512DQ-NEXT:    kshiftrw $9, %k4, %k6
-; AVX512DQ-NEXT:    kmovw %k6, %eax
-; AVX512DQ-NEXT:    movzbl %al, %eax
-; AVX512DQ-NEXT:    movl %eax, %r11d
-; AVX512DQ-NEXT:    andl $1, %r11d
-; AVX512DQ-NEXT:    leaq (%r11,%r11,2), %r15
-; AVX512DQ-NEXT:    leaq (%r15,%r11,4), %r15
-; AVX512DQ-NEXT:    leaq (%r15,%r11,8), %r15
-; AVX512DQ-NEXT:    movq %r11, %r12
-; AVX512DQ-NEXT:    shlq $4, %r12
-; AVX512DQ-NEXT:    orq %r15, %r12
-; AVX512DQ-NEXT:    shlq $5, %r11
-; AVX512DQ-NEXT:    orq %r12, %r11
-; AVX512DQ-NEXT:    movzbl %r14b, %r15d
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    movq %r15, %r14
-; AVX512DQ-NEXT:    shlq $6, %r14
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $7, %r12
-; AVX512DQ-NEXT:    orq %r14, %r12
-; AVX512DQ-NEXT:    movq %r15, %r14
-; AVX512DQ-NEXT:    shlq $8, %r14
-; AVX512DQ-NEXT:    orq %r12, %r14
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $9, %r12
-; AVX512DQ-NEXT:    orq %r14, %r12
-; AVX512DQ-NEXT:    movq %r15, %r14
-; AVX512DQ-NEXT:    shlq $10, %r14
-; AVX512DQ-NEXT:    orq %r12, %r14
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $11, %r12
-; AVX512DQ-NEXT:    orq %r14, %r12
-; AVX512DQ-NEXT:    shlq $12, %r15
-; AVX512DQ-NEXT:    orq %r12, %r15
-; AVX512DQ-NEXT:    movzbl %bpl, %r14d
-; AVX512DQ-NEXT:    andl $1, %r14d
-; AVX512DQ-NEXT:    movq %r14, %r12
-; AVX512DQ-NEXT:    shlq $13, %r12
-; AVX512DQ-NEXT:    orq %r15, %r12
-; AVX512DQ-NEXT:    movq %r14, %r15
-; AVX512DQ-NEXT:    shlq $14, %r15
-; AVX512DQ-NEXT:    orq %r12, %r15
-; AVX512DQ-NEXT:    movq %r14, %r12
-; AVX512DQ-NEXT:    shlq $15, %r12
-; AVX512DQ-NEXT:    orq %r15, %r12
-; AVX512DQ-NEXT:    movq %r14, %r15
-; AVX512DQ-NEXT:    shlq $16, %r15
-; AVX512DQ-NEXT:    orq %r12, %r15
-; AVX512DQ-NEXT:    movq %r14, %r12
-; AVX512DQ-NEXT:    shlq $17, %r12
-; AVX512DQ-NEXT:    orq %r15, %r12
-; AVX512DQ-NEXT:    movq %r14, %r15
-; AVX512DQ-NEXT:    shlq $18, %r15
-; AVX512DQ-NEXT:    orq %r12, %r15
-; AVX512DQ-NEXT:    shlq $19, %r14
-; AVX512DQ-NEXT:    orq %r15, %r14
-; AVX512DQ-NEXT:    movzbl %bl, %r15d
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    movq %r15, %rbx
-; AVX512DQ-NEXT:    shlq $20, %rbx
-; AVX512DQ-NEXT:    orq %r14, %rbx
-; AVX512DQ-NEXT:    movq %r15, %r14
-; AVX512DQ-NEXT:    shlq $21, %r14
-; AVX512DQ-NEXT:    orq %rbx, %r14
-; AVX512DQ-NEXT:    movq %r15, %rbx
-; AVX512DQ-NEXT:    shlq $22, %rbx
-; AVX512DQ-NEXT:    orq %r14, %rbx
-; AVX512DQ-NEXT:    movq %r15, %r14
-; AVX512DQ-NEXT:    shlq $23, %r14
-; AVX512DQ-NEXT:    orq %rbx, %r14
-; AVX512DQ-NEXT:    movq %r15, %rbx
-; AVX512DQ-NEXT:    shlq $24, %rbx
-; AVX512DQ-NEXT:    orq %r14, %rbx
-; AVX512DQ-NEXT:    movq %r15, %r14
-; AVX512DQ-NEXT:    shlq $25, %r14
-; AVX512DQ-NEXT:    orq %rbx, %r14
-; AVX512DQ-NEXT:    shlq $26, %r15
-; AVX512DQ-NEXT:    orq %r14, %r15
-; AVX512DQ-NEXT:    movzbl %r10b, %ebx
-; AVX512DQ-NEXT:    andl $1, %ebx
-; AVX512DQ-NEXT:    movq %rbx, %r14
-; AVX512DQ-NEXT:    shlq $27, %r14
-; AVX512DQ-NEXT:    orq %r15, %r14
-; AVX512DQ-NEXT:    movq %rbx, %r15
-; AVX512DQ-NEXT:    shlq $28, %r15
-; AVX512DQ-NEXT:    orq %r14, %r15
-; AVX512DQ-NEXT:    movq %rbx, %r14
-; AVX512DQ-NEXT:    shlq $29, %r14
-; AVX512DQ-NEXT:    orq %r15, %r14
-; AVX512DQ-NEXT:    movq %rbx, %r15
-; AVX512DQ-NEXT:    shlq $30, %r15
-; AVX512DQ-NEXT:    orq %r14, %r15
-; AVX512DQ-NEXT:    shlq $31, %rbx
-; AVX512DQ-NEXT:    orq %r15, %rbx
-; AVX512DQ-NEXT:    orq %r11, %rbx
-; AVX512DQ-NEXT:    movl %ebx, 8(%rsi)
-; AVX512DQ-NEXT:    andl $1, %r10d
-; AVX512DQ-NEXT:    movq %r10, %r11
-; AVX512DQ-NEXT:    shlq $32, %r11
-; AVX512DQ-NEXT:    shlq $33, %r10
-; AVX512DQ-NEXT:    orq %r11, %r10
-; AVX512DQ-NEXT:    kmovw %k5, %r11d
-; AVX512DQ-NEXT:    andl $1, %r11d
-; AVX512DQ-NEXT:    movq %r11, %r14
-; AVX512DQ-NEXT:    shlq $34, %r14
-; AVX512DQ-NEXT:    orq %r10, %r14
-; AVX512DQ-NEXT:    movq %r11, %r10
-; AVX512DQ-NEXT:    shlq $35, %r10
-; AVX512DQ-NEXT:    orq %r14, %r10
-; AVX512DQ-NEXT:    movq %r11, %r14
-; AVX512DQ-NEXT:    shlq $36, %r14
-; AVX512DQ-NEXT:    orq %r10, %r14
-; AVX512DQ-NEXT:    movq %r11, %r10
-; AVX512DQ-NEXT:    shlq $37, %r10
-; AVX512DQ-NEXT:    orq %r14, %r10
-; AVX512DQ-NEXT:    movq %r11, %r14
-; AVX512DQ-NEXT:    shlq $38, %r14
-; AVX512DQ-NEXT:    orq %r10, %r14
-; AVX512DQ-NEXT:    movq %r11, %r10
-; AVX512DQ-NEXT:    shlq $39, %r10
-; AVX512DQ-NEXT:    orq %r14, %r10
-; AVX512DQ-NEXT:    shlq $40, %r11
-; AVX512DQ-NEXT:    orq %r10, %r11
-; AVX512DQ-NEXT:    movzbl %r8b, %r10d
-; AVX512DQ-NEXT:    # kill: def $r8d killed $r8d def $r8
-; AVX512DQ-NEXT:    andl $1, %r8d
-; AVX512DQ-NEXT:    movq %r8, %r14
-; AVX512DQ-NEXT:    shlq $41, %r14
-; AVX512DQ-NEXT:    orq %r11, %r14
-; AVX512DQ-NEXT:    movq %r8, %r11
-; AVX512DQ-NEXT:    shlq $42, %r11
-; AVX512DQ-NEXT:    orq %r14, %r11
-; AVX512DQ-NEXT:    movq %r8, %r14
-; AVX512DQ-NEXT:    shlq $43, %r14
-; AVX512DQ-NEXT:    orq %r11, %r14
-; AVX512DQ-NEXT:    movq %r8, %r11
-; AVX512DQ-NEXT:    shlq $44, %r11
-; AVX512DQ-NEXT:    orq %r14, %r11
-; AVX512DQ-NEXT:    movq %r8, %r14
-; AVX512DQ-NEXT:    shlq $45, %r14
-; AVX512DQ-NEXT:    orq %r11, %r14
-; AVX512DQ-NEXT:    shlq $46, %r8
-; AVX512DQ-NEXT:    orq %r14, %r8
-; AVX512DQ-NEXT:    shlq $47, %r10
-; AVX512DQ-NEXT:    orq %r8, %r10
-; AVX512DQ-NEXT:    orq %rbx, %r10
-; AVX512DQ-NEXT:    shrq $32, %r10
-; AVX512DQ-NEXT:    movw %r10w, 12(%rsi)
-; AVX512DQ-NEXT:    kmovw %k4, %r8d
-; AVX512DQ-NEXT:    movzbl %r8b, %r10d
-; AVX512DQ-NEXT:    andl $1, %r10d
-; AVX512DQ-NEXT:    leaq (%r10,%r10,2), %r8
-; AVX512DQ-NEXT:    leaq (%r8,%r10,4), %r8
-; AVX512DQ-NEXT:    leaq (%r8,%r10,8), %r8
-; AVX512DQ-NEXT:    movq %r10, %r11
-; AVX512DQ-NEXT:    shlq $4, %r11
-; AVX512DQ-NEXT:    orq %r8, %r11
-; AVX512DQ-NEXT:    movq %r10, %r8
-; AVX512DQ-NEXT:    shlq $5, %r8
-; AVX512DQ-NEXT:    orq %r11, %r8
-; AVX512DQ-NEXT:    shlq $6, %r10
-; AVX512DQ-NEXT:    movzbl %r9b, %r9d
-; AVX512DQ-NEXT:    andl $1, %r9d
-; AVX512DQ-NEXT:    movq %r9, %r11
-; AVX512DQ-NEXT:    shlq $7, %r11
-; AVX512DQ-NEXT:    orq %r10, %r11
-; AVX512DQ-NEXT:    movq %r9, %r10
-; AVX512DQ-NEXT:    shlq $8, %r10
-; AVX512DQ-NEXT:    orq %r11, %r10
-; AVX512DQ-NEXT:    movq %r9, %r11
-; AVX512DQ-NEXT:    shlq $9, %r11
-; AVX512DQ-NEXT:    orq %r10, %r11
-; AVX512DQ-NEXT:    movq %r9, %r10
-; AVX512DQ-NEXT:    shlq $10, %r10
-; AVX512DQ-NEXT:    orq %r11, %r10
-; AVX512DQ-NEXT:    movq %r9, %r11
-; AVX512DQ-NEXT:    shlq $11, %r11
-; AVX512DQ-NEXT:    orq %r10, %r11
-; AVX512DQ-NEXT:    movq %r9, %r10
-; AVX512DQ-NEXT:    shlq $12, %r10
-; AVX512DQ-NEXT:    orq %r11, %r10
-; AVX512DQ-NEXT:    shlq $13, %r9
-; AVX512DQ-NEXT:    orq %r10, %r9
-; AVX512DQ-NEXT:    movzbl %dil, %edi
-; AVX512DQ-NEXT:    andl $1, %edi
-; AVX512DQ-NEXT:    movq %rdi, %r10
-; AVX512DQ-NEXT:    shlq $14, %r10
-; AVX512DQ-NEXT:    orq %r9, %r10
-; AVX512DQ-NEXT:    movq %rdi, %r9
-; AVX512DQ-NEXT:    shlq $15, %r9
-; AVX512DQ-NEXT:    orq %r10, %r9
-; AVX512DQ-NEXT:    movq %rdi, %r10
-; AVX512DQ-NEXT:    shlq $16, %r10
-; AVX512DQ-NEXT:    orq %r9, %r10
-; AVX512DQ-NEXT:    movq %rdi, %r9
-; AVX512DQ-NEXT:    shlq $17, %r9
-; AVX512DQ-NEXT:    orq %r10, %r9
-; AVX512DQ-NEXT:    movq %rdi, %r10
-; AVX512DQ-NEXT:    shlq $18, %r10
-; AVX512DQ-NEXT:    orq %r9, %r10
-; AVX512DQ-NEXT:    movq %rdi, %r9
-; AVX512DQ-NEXT:    shlq $19, %r9
-; AVX512DQ-NEXT:    orq %r10, %r9
-; AVX512DQ-NEXT:    shlq $20, %rdi
-; AVX512DQ-NEXT:    orq %r9, %rdi
-; AVX512DQ-NEXT:    movzbl %dl, %edx
-; AVX512DQ-NEXT:    andl $1, %edx
-; AVX512DQ-NEXT:    movq %rdx, %r9
-; AVX512DQ-NEXT:    shlq $21, %r9
-; AVX512DQ-NEXT:    orq %rdi, %r9
-; AVX512DQ-NEXT:    movq %rdx, %rdi
-; AVX512DQ-NEXT:    shlq $22, %rdi
-; AVX512DQ-NEXT:    orq %r9, %rdi
-; AVX512DQ-NEXT:    movq %rdx, %r9
-; AVX512DQ-NEXT:    shlq $23, %r9
-; AVX512DQ-NEXT:    orq %rdi, %r9
-; AVX512DQ-NEXT:    movq %rdx, %rdi
-; AVX512DQ-NEXT:    shlq $24, %rdi
-; AVX512DQ-NEXT:    orq %r9, %rdi
-; AVX512DQ-NEXT:    movq %rdx, %r9
-; AVX512DQ-NEXT:    shlq $25, %r9
-; AVX512DQ-NEXT:    orq %rdi, %r9
-; AVX512DQ-NEXT:    movq %rdx, %rdi
-; AVX512DQ-NEXT:    shlq $26, %rdi
-; AVX512DQ-NEXT:    orq %r9, %rdi
-; AVX512DQ-NEXT:    shlq $27, %rdx
-; AVX512DQ-NEXT:    orq %rdi, %rdx
-; AVX512DQ-NEXT:    movzbl %cl, %edi
-; AVX512DQ-NEXT:    andl $1, %edi
-; AVX512DQ-NEXT:    movq %rdi, %r9
-; AVX512DQ-NEXT:    shlq $28, %r9
-; AVX512DQ-NEXT:    orq %rdx, %r9
-; AVX512DQ-NEXT:    movq %rdi, %rdx
-; AVX512DQ-NEXT:    shlq $29, %rdx
-; AVX512DQ-NEXT:    orq %r9, %rdx
-; AVX512DQ-NEXT:    movq %rdi, %r9
-; AVX512DQ-NEXT:    shlq $30, %r9
-; AVX512DQ-NEXT:    orq %rdx, %r9
-; AVX512DQ-NEXT:    shlq $31, %rdi
-; AVX512DQ-NEXT:    orq %r9, %rdi
-; AVX512DQ-NEXT:    andl $1, %ecx
-; AVX512DQ-NEXT:    movq %rcx, %rdx
-; AVX512DQ-NEXT:    shlq $32, %rdx
-; AVX512DQ-NEXT:    orq %rdi, %rdx
-; AVX512DQ-NEXT:    movq %rcx, %rdi
-; AVX512DQ-NEXT:    shlq $33, %rdi
-; AVX512DQ-NEXT:    orq %rdx, %rdi
-; AVX512DQ-NEXT:    shlq $34, %rcx
-; AVX512DQ-NEXT:    orq %rdi, %rcx
-; AVX512DQ-NEXT:    kmovw %k3, %edx
-; AVX512DQ-NEXT:    andl $1, %edx
-; AVX512DQ-NEXT:    movq %rdx, %rdi
-; AVX512DQ-NEXT:    shlq $35, %rdi
-; AVX512DQ-NEXT:    orq %rcx, %rdi
-; AVX512DQ-NEXT:    movq %rdx, %rcx
-; AVX512DQ-NEXT:    shlq $36, %rcx
-; AVX512DQ-NEXT:    orq %rdi, %rcx
-; AVX512DQ-NEXT:    movq %rdx, %rdi
-; AVX512DQ-NEXT:    shlq $37, %rdi
-; AVX512DQ-NEXT:    orq %rcx, %rdi
-; AVX512DQ-NEXT:    movq %rdx, %rcx
-; AVX512DQ-NEXT:    shlq $38, %rcx
-; AVX512DQ-NEXT:    orq %rdi, %rcx
-; AVX512DQ-NEXT:    movq %rdx, %rdi
-; AVX512DQ-NEXT:    shlq $39, %rdi
-; AVX512DQ-NEXT:    orq %rcx, %rdi
-; AVX512DQ-NEXT:    movq %rdx, %rcx
-; AVX512DQ-NEXT:    shlq $40, %rcx
-; AVX512DQ-NEXT:    orq %rdi, %rcx
-; AVX512DQ-NEXT:    shlq $41, %rdx
-; AVX512DQ-NEXT:    orq %rcx, %rdx
-; AVX512DQ-NEXT:    kmovw %k2, %ecx
-; AVX512DQ-NEXT:    andl $1, %ecx
-; AVX512DQ-NEXT:    movq %rcx, %rdi
-; AVX512DQ-NEXT:    shlq $42, %rdi
-; AVX512DQ-NEXT:    orq %rdx, %rdi
-; AVX512DQ-NEXT:    movq %rcx, %rdx
-; AVX512DQ-NEXT:    shlq $43, %rdx
-; AVX512DQ-NEXT:    orq %rdi, %rdx
-; AVX512DQ-NEXT:    movq %rcx, %rdi
-; AVX512DQ-NEXT:    shlq $44, %rdi
-; AVX512DQ-NEXT:    orq %rdx, %rdi
-; AVX512DQ-NEXT:    movq %rcx, %rdx
-; AVX512DQ-NEXT:    shlq $45, %rdx
-; AVX512DQ-NEXT:    orq %rdi, %rdx
-; AVX512DQ-NEXT:    movq %rcx, %rdi
-; AVX512DQ-NEXT:    shlq $46, %rdi
-; AVX512DQ-NEXT:    orq %rdx, %rdi
-; AVX512DQ-NEXT:    movq %rcx, %rdx
-; AVX512DQ-NEXT:    shlq $47, %rdx
-; AVX512DQ-NEXT:    orq %rdi, %rdx
-; AVX512DQ-NEXT:    shlq $48, %rcx
-; AVX512DQ-NEXT:    orq %rdx, %rcx
-; AVX512DQ-NEXT:    kmovw %k1, %edx
-; AVX512DQ-NEXT:    andl $1, %edx
-; AVX512DQ-NEXT:    movq %rdx, %rdi
-; AVX512DQ-NEXT:    shlq $49, %rdi
-; AVX512DQ-NEXT:    orq %rcx, %rdi
-; AVX512DQ-NEXT:    movq %rdx, %rcx
-; AVX512DQ-NEXT:    shlq $50, %rcx
-; AVX512DQ-NEXT:    orq %rdi, %rcx
-; AVX512DQ-NEXT:    movq %rdx, %rdi
-; AVX512DQ-NEXT:    shlq $51, %rdi
-; AVX512DQ-NEXT:    orq %rcx, %rdi
-; AVX512DQ-NEXT:    movq %rdx, %rcx
-; AVX512DQ-NEXT:    shlq $52, %rcx
-; AVX512DQ-NEXT:    orq %rdi, %rcx
-; AVX512DQ-NEXT:    movq %rdx, %rdi
-; AVX512DQ-NEXT:    shlq $53, %rdi
-; AVX512DQ-NEXT:    orq %rcx, %rdi
-; AVX512DQ-NEXT:    movq %rdx, %rcx
-; AVX512DQ-NEXT:    shlq $54, %rcx
-; AVX512DQ-NEXT:    orq %rdi, %rcx
-; AVX512DQ-NEXT:    shlq $55, %rdx
-; AVX512DQ-NEXT:    orq %rcx, %rdx
-; AVX512DQ-NEXT:    kmovw %k0, %ecx
-; AVX512DQ-NEXT:    andl $1, %ecx
-; AVX512DQ-NEXT:    movq %rcx, %rdi
-; AVX512DQ-NEXT:    shlq $56, %rdi
-; AVX512DQ-NEXT:    orq %rdx, %rdi
-; AVX512DQ-NEXT:    movq %rcx, %rdx
-; AVX512DQ-NEXT:    shlq $57, %rdx
-; AVX512DQ-NEXT:    orq %rdi, %rdx
-; AVX512DQ-NEXT:    movq %rcx, %rdi
-; AVX512DQ-NEXT:    shlq $58, %rdi
-; AVX512DQ-NEXT:    orq %rdx, %rdi
-; AVX512DQ-NEXT:    movq %rcx, %rdx
-; AVX512DQ-NEXT:    shlq $59, %rdx
-; AVX512DQ-NEXT:    orq %rdi, %rdx
-; AVX512DQ-NEXT:    movq %rcx, %rdi
-; AVX512DQ-NEXT:    shlq $60, %rdi
-; AVX512DQ-NEXT:    orq %rdx, %rdi
-; AVX512DQ-NEXT:    movq %rcx, %rdx
-; AVX512DQ-NEXT:    shlq $61, %rdx
-; AVX512DQ-NEXT:    orq %rdi, %rdx
-; AVX512DQ-NEXT:    shlq $62, %rcx
-; AVX512DQ-NEXT:    orq %rdx, %rcx
-; AVX512DQ-NEXT:    shlq $63, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    orq %r8, %rax
-; AVX512DQ-NEXT:    movq %rax, (%rsi)
-; AVX512DQ-NEXT:    popq %rbx
-; AVX512DQ-NEXT:    popq %r12
-; AVX512DQ-NEXT:    popq %r14
-; AVX512DQ-NEXT:    popq %r15
-; AVX512DQ-NEXT:    popq %rbp
+; AVX512DQ-NEXT:    kmovw (%rdi), %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm1
+; AVX512DQ-NEXT:    movw $1, %ax
+; AVX512DQ-NEXT:    kmovw %eax, %k1
+; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k2
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k1
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k3
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k4
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k5
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k6
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k7
+; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k2} {z}
+; AVX512DQ-NEXT:    vmovdqa32 384(%rsi), %zmm1 {%k7} {z}
+; AVX512DQ-NEXT:    vmovdqa32 320(%rsi), %zmm2 {%k6} {z}
+; AVX512DQ-NEXT:    vmovdqa32 256(%rsi), %zmm3 {%k5} {z}
+; AVX512DQ-NEXT:    vmovdqa32 192(%rsi), %zmm4 {%k4} {z}
+; AVX512DQ-NEXT:    vmovdqa32 128(%rsi), %zmm5 {%k3} {z}
+; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm6 {%k1} {z}
+; AVX512DQ-NEXT:    vmovdqa64 %zmm6, 64(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm5, 128(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 192(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 256(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 320(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 384(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm0, (%rdx)
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
-; AVX512BW-LABEL: mask_replication_factor7_vf16:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    pushq %rbp
-; AVX512BW-NEXT:    pushq %r15
-; AVX512BW-NEXT:    pushq %r14
-; AVX512BW-NEXT:    pushq %r12
-; AVX512BW-NEXT:    pushq %rbx
-; AVX512BW-NEXT:    kmovw (%rdi), %k4
-; AVX512BW-NEXT:    kshiftrw $8, %k4, %k0
-; AVX512BW-NEXT:    kshiftrw $7, %k4, %k1
-; AVX512BW-NEXT:    kshiftrw $6, %k4, %k2
-; AVX512BW-NEXT:    kshiftrw $5, %k4, %k3
-; AVX512BW-NEXT:    kshiftrw $4, %k4, %k5
-; AVX512BW-NEXT:    kmovd %k5, %ecx
-; AVX512BW-NEXT:    kshiftrw $3, %k4, %k5
-; AVX512BW-NEXT:    kmovd %k5, %edx
-; AVX512BW-NEXT:    kshiftrw $2, %k4, %k5
-; AVX512BW-NEXT:    kmovd %k5, %edi
-; AVX512BW-NEXT:    kshiftrw $1, %k4, %k5
-; AVX512BW-NEXT:    kmovd %k5, %r9d
-; AVX512BW-NEXT:    kshiftrw $15, %k4, %k5
-; AVX512BW-NEXT:    kmovd %k5, %r8d
-; AVX512BW-NEXT:    kshiftrw $14, %k4, %k5
-; AVX512BW-NEXT:    kshiftrw $13, %k4, %k6
-; AVX512BW-NEXT:    kmovd %k6, %r10d
-; AVX512BW-NEXT:    kshiftrw $12, %k4, %k6
-; AVX512BW-NEXT:    kmovd %k6, %ebx
-; AVX512BW-NEXT:    kshiftrw $11, %k4, %k6
-; AVX512BW-NEXT:    kmovd %k6, %ebp
-; AVX512BW-NEXT:    kshiftrw $10, %k4, %k6
-; AVX512BW-NEXT:    kmovd %k6, %r14d
-; AVX512BW-NEXT:    kshiftrw $9, %k4, %k6
-; AVX512BW-NEXT:    kmovd %k6, %eax
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    movl %eax, %r11d
-; AVX512BW-NEXT:    andl $1, %r11d
-; AVX512BW-NEXT:    leaq (%r11,%r11,2), %r15
-; AVX512BW-NEXT:    leaq (%r15,%r11,4), %r15
-; AVX512BW-NEXT:    leaq (%r15,%r11,8), %r15
-; AVX512BW-NEXT:    movq %r11, %r12
-; AVX512BW-NEXT:    shlq $4, %r12
-; AVX512BW-NEXT:    orq %r15, %r12
-; AVX512BW-NEXT:    shlq $5, %r11
-; AVX512BW-NEXT:    orq %r12, %r11
-; AVX512BW-NEXT:    movzbl %r14b, %r15d
-; AVX512BW-NEXT:    andl $1, %r15d
-; AVX512BW-NEXT:    movq %r15, %r14
-; AVX512BW-NEXT:    shlq $6, %r14
-; AVX512BW-NEXT:    movq %r15, %r12
-; AVX512BW-NEXT:    shlq $7, %r12
-; AVX512BW-NEXT:    orq %r14, %r12
-; AVX512BW-NEXT:    movq %r15, %r14
-; AVX512BW-NEXT:    shlq $8, %r14
-; AVX512BW-NEXT:    orq %r12, %r14
-; AVX512BW-NEXT:    movq %r15, %r12
-; AVX512BW-NEXT:    shlq $9, %r12
-; AVX512BW-NEXT:    orq %r14, %r12
-; AVX512BW-NEXT:    movq %r15, %r14
-; AVX512BW-NEXT:    shlq $10, %r14
-; AVX512BW-NEXT:    orq %r12, %r14
-; AVX512BW-NEXT:    movq %r15, %r12
-; AVX512BW-NEXT:    shlq $11, %r12
-; AVX512BW-NEXT:    orq %r14, %r12
-; AVX512BW-NEXT:    shlq $12, %r15
-; AVX512BW-NEXT:    orq %r12, %r15
-; AVX512BW-NEXT:    movzbl %bpl, %r14d
-; AVX512BW-NEXT:    andl $1, %r14d
-; AVX512BW-NEXT:    movq %r14, %r12
-; AVX512BW-NEXT:    shlq $13, %r12
-; AVX512BW-NEXT:    orq %r15, %r12
-; AVX512BW-NEXT:    movq %r14, %r15
-; AVX512BW-NEXT:    shlq $14, %r15
-; AVX512BW-NEXT:    orq %r12, %r15
-; AVX512BW-NEXT:    movq %r14, %r12
-; AVX512BW-NEXT:    shlq $15, %r12
-; AVX512BW-NEXT:    orq %r15, %r12
-; AVX512BW-NEXT:    movq %r14, %r15
-; AVX512BW-NEXT:    shlq $16, %r15
-; AVX512BW-NEXT:    orq %r12, %r15
-; AVX512BW-NEXT:    movq %r14, %r12
-; AVX512BW-NEXT:    shlq $17, %r12
-; AVX512BW-NEXT:    orq %r15, %r12
-; AVX512BW-NEXT:    movq %r14, %r15
-; AVX512BW-NEXT:    shlq $18, %r15
-; AVX512BW-NEXT:    orq %r12, %r15
-; AVX512BW-NEXT:    shlq $19, %r14
-; AVX512BW-NEXT:    orq %r15, %r14
-; AVX512BW-NEXT:    movzbl %bl, %r15d
-; AVX512BW-NEXT:    andl $1, %r15d
-; AVX512BW-NEXT:    movq %r15, %rbx
-; AVX512BW-NEXT:    shlq $20, %rbx
-; AVX512BW-NEXT:    orq %r14, %rbx
-; AVX512BW-NEXT:    movq %r15, %r14
-; AVX512BW-NEXT:    shlq $21, %r14
-; AVX512BW-NEXT:    orq %rbx, %r14
-; AVX512BW-NEXT:    movq %r15, %rbx
-; AVX512BW-NEXT:    shlq $22, %rbx
-; AVX512BW-NEXT:    orq %r14, %rbx
-; AVX512BW-NEXT:    movq %r15, %r14
-; AVX512BW-NEXT:    shlq $23, %r14
-; AVX512BW-NEXT:    orq %rbx, %r14
-; AVX512BW-NEXT:    movq %r15, %rbx
-; AVX512BW-NEXT:    shlq $24, %rbx
-; AVX512BW-NEXT:    orq %r14, %rbx
-; AVX512BW-NEXT:    movq %r15, %r14
-; AVX512BW-NEXT:    shlq $25, %r14
-; AVX512BW-NEXT:    orq %rbx, %r14
-; AVX512BW-NEXT:    shlq $26, %r15
-; AVX512BW-NEXT:    orq %r14, %r15
-; AVX512BW-NEXT:    movzbl %r10b, %ebx
-; AVX512BW-NEXT:    andl $1, %ebx
-; AVX512BW-NEXT:    movq %rbx, %r14
-; AVX512BW-NEXT:    shlq $27, %r14
-; AVX512BW-NEXT:    orq %r15, %r14
-; AVX512BW-NEXT:    movq %rbx, %r15
-; AVX512BW-NEXT:    shlq $28, %r15
-; AVX512BW-NEXT:    orq %r14, %r15
-; AVX512BW-NEXT:    movq %rbx, %r14
-; AVX512BW-NEXT:    shlq $29, %r14
-; AVX512BW-NEXT:    orq %r15, %r14
-; AVX512BW-NEXT:    movq %rbx, %r15
-; AVX512BW-NEXT:    shlq $30, %r15
-; AVX512BW-NEXT:    orq %r14, %r15
-; AVX512BW-NEXT:    shlq $31, %rbx
-; AVX512BW-NEXT:    orq %r15, %rbx
-; AVX512BW-NEXT:    orq %r11, %rbx
-; AVX512BW-NEXT:    movl %ebx, 8(%rsi)
-; AVX512BW-NEXT:    andl $1, %r10d
-; AVX512BW-NEXT:    movq %r10, %r11
-; AVX512BW-NEXT:    shlq $32, %r11
-; AVX512BW-NEXT:    shlq $33, %r10
-; AVX512BW-NEXT:    orq %r11, %r10
-; AVX512BW-NEXT:    kmovd %k5, %r11d
-; AVX512BW-NEXT:    andl $1, %r11d
-; AVX512BW-NEXT:    movq %r11, %r14
-; AVX512BW-NEXT:    shlq $34, %r14
-; AVX512BW-NEXT:    orq %r10, %r14
-; AVX512BW-NEXT:    movq %r11, %r10
-; AVX512BW-NEXT:    shlq $35, %r10
-; AVX512BW-NEXT:    orq %r14, %r10
-; AVX512BW-NEXT:    movq %r11, %r14
-; AVX512BW-NEXT:    shlq $36, %r14
-; AVX512BW-NEXT:    orq %r10, %r14
-; AVX512BW-NEXT:    movq %r11, %r10
-; AVX512BW-NEXT:    shlq $37, %r10
-; AVX512BW-NEXT:    orq %r14, %r10
-; AVX512BW-NEXT:    movq %r11, %r14
-; AVX512BW-NEXT:    shlq $38, %r14
-; AVX512BW-NEXT:    orq %r10, %r14
-; AVX512BW-NEXT:    movq %r11, %r10
-; AVX512BW-NEXT:    shlq $39, %r10
-; AVX512BW-NEXT:    orq %r14, %r10
-; AVX512BW-NEXT:    shlq $40, %r11
-; AVX512BW-NEXT:    orq %r10, %r11
-; AVX512BW-NEXT:    movzbl %r8b, %r10d
-; AVX512BW-NEXT:    # kill: def $r8d killed $r8d def $r8
-; AVX512BW-NEXT:    andl $1, %r8d
-; AVX512BW-NEXT:    movq %r8, %r14
-; AVX512BW-NEXT:    shlq $41, %r14
-; AVX512BW-NEXT:    orq %r11, %r14
-; AVX512BW-NEXT:    movq %r8, %r11
-; AVX512BW-NEXT:    shlq $42, %r11
-; AVX512BW-NEXT:    orq %r14, %r11
-; AVX512BW-NEXT:    movq %r8, %r14
-; AVX512BW-NEXT:    shlq $43, %r14
-; AVX512BW-NEXT:    orq %r11, %r14
-; AVX512BW-NEXT:    movq %r8, %r11
-; AVX512BW-NEXT:    shlq $44, %r11
-; AVX512BW-NEXT:    orq %r14, %r11
-; AVX512BW-NEXT:    movq %r8, %r14
-; AVX512BW-NEXT:    shlq $45, %r14
-; AVX512BW-NEXT:    orq %r11, %r14
-; AVX512BW-NEXT:    shlq $46, %r8
-; AVX512BW-NEXT:    orq %r14, %r8
-; AVX512BW-NEXT:    shlq $47, %r10
-; AVX512BW-NEXT:    orq %r8, %r10
-; AVX512BW-NEXT:    orq %rbx, %r10
-; AVX512BW-NEXT:    shrq $32, %r10
-; AVX512BW-NEXT:    movw %r10w, 12(%rsi)
-; AVX512BW-NEXT:    kmovd %k4, %r8d
-; AVX512BW-NEXT:    movzbl %r8b, %r10d
-; AVX512BW-NEXT:    andl $1, %r10d
-; AVX512BW-NEXT:    leaq (%r10,%r10,2), %r8
-; AVX512BW-NEXT:    leaq (%r8,%r10,4), %r8
-; AVX512BW-NEXT:    leaq (%r8,%r10,8), %r8
-; AVX512BW-NEXT:    movq %r10, %r11
-; AVX512BW-NEXT:    shlq $4, %r11
-; AVX512BW-NEXT:    orq %r8, %r11
-; AVX512BW-NEXT:    movq %r10, %r8
-; AVX512BW-NEXT:    shlq $5, %r8
-; AVX512BW-NEXT:    orq %r11, %r8
-; AVX512BW-NEXT:    shlq $6, %r10
-; AVX512BW-NEXT:    movzbl %r9b, %r9d
-; AVX512BW-NEXT:    andl $1, %r9d
-; AVX512BW-NEXT:    movq %r9, %r11
-; AVX512BW-NEXT:    shlq $7, %r11
-; AVX512BW-NEXT:    orq %r10, %r11
-; AVX512BW-NEXT:    movq %r9, %r10
-; AVX512BW-NEXT:    shlq $8, %r10
-; AVX512BW-NEXT:    orq %r11, %r10
-; AVX512BW-NEXT:    movq %r9, %r11
-; AVX512BW-NEXT:    shlq $9, %r11
-; AVX512BW-NEXT:    orq %r10, %r11
-; AVX512BW-NEXT:    movq %r9, %r10
-; AVX512BW-NEXT:    shlq $10, %r10
-; AVX512BW-NEXT:    orq %r11, %r10
-; AVX512BW-NEXT:    movq %r9, %r11
-; AVX512BW-NEXT:    shlq $11, %r11
-; AVX512BW-NEXT:    orq %r10, %r11
-; AVX512BW-NEXT:    movq %r9, %r10
-; AVX512BW-NEXT:    shlq $12, %r10
-; AVX512BW-NEXT:    orq %r11, %r10
-; AVX512BW-NEXT:    shlq $13, %r9
-; AVX512BW-NEXT:    orq %r10, %r9
-; AVX512BW-NEXT:    movzbl %dil, %edi
-; AVX512BW-NEXT:    andl $1, %edi
-; AVX512BW-NEXT:    movq %rdi, %r10
-; AVX512BW-NEXT:    shlq $14, %r10
-; AVX512BW-NEXT:    orq %r9, %r10
-; AVX512BW-NEXT:    movq %rdi, %r9
-; AVX512BW-NEXT:    shlq $15, %r9
-; AVX512BW-NEXT:    orq %r10, %r9
-; AVX512BW-NEXT:    movq %rdi, %r10
-; AVX512BW-NEXT:    shlq $16, %r10
-; AVX512BW-NEXT:    orq %r9, %r10
-; AVX512BW-NEXT:    movq %rdi, %r9
-; AVX512BW-NEXT:    shlq $17, %r9
-; AVX512BW-NEXT:    orq %r10, %r9
-; AVX512BW-NEXT:    movq %rdi, %r10
-; AVX512BW-NEXT:    shlq $18, %r10
-; AVX512BW-NEXT:    orq %r9, %r10
-; AVX512BW-NEXT:    movq %rdi, %r9
-; AVX512BW-NEXT:    shlq $19, %r9
-; AVX512BW-NEXT:    orq %r10, %r9
-; AVX512BW-NEXT:    shlq $20, %rdi
-; AVX512BW-NEXT:    orq %r9, %rdi
-; AVX512BW-NEXT:    movzbl %dl, %edx
-; AVX512BW-NEXT:    andl $1, %edx
-; AVX512BW-NEXT:    movq %rdx, %r9
-; AVX512BW-NEXT:    shlq $21, %r9
-; AVX512BW-NEXT:    orq %rdi, %r9
-; AVX512BW-NEXT:    movq %rdx, %rdi
-; AVX512BW-NEXT:    shlq $22, %rdi
-; AVX512BW-NEXT:    orq %r9, %rdi
-; AVX512BW-NEXT:    movq %rdx, %r9
-; AVX512BW-NEXT:    shlq $23, %r9
-; AVX512BW-NEXT:    orq %rdi, %r9
-; AVX512BW-NEXT:    movq %rdx, %rdi
-; AVX512BW-NEXT:    shlq $24, %rdi
-; AVX512BW-NEXT:    orq %r9, %rdi
-; AVX512BW-NEXT:    movq %rdx, %r9
-; AVX512BW-NEXT:    shlq $25, %r9
-; AVX512BW-NEXT:    orq %rdi, %r9
-; AVX512BW-NEXT:    movq %rdx, %rdi
-; AVX512BW-NEXT:    shlq $26, %rdi
-; AVX512BW-NEXT:    orq %r9, %rdi
-; AVX512BW-NEXT:    shlq $27, %rdx
-; AVX512BW-NEXT:    orq %rdi, %rdx
-; AVX512BW-NEXT:    movzbl %cl, %edi
-; AVX512BW-NEXT:    andl $1, %edi
-; AVX512BW-NEXT:    movq %rdi, %r9
-; AVX512BW-NEXT:    shlq $28, %r9
-; AVX512BW-NEXT:    orq %rdx, %r9
-; AVX512BW-NEXT:    movq %rdi, %rdx
-; AVX512BW-NEXT:    shlq $29, %rdx
-; AVX512BW-NEXT:    orq %r9, %rdx
-; AVX512BW-NEXT:    movq %rdi, %r9
-; AVX512BW-NEXT:    shlq $30, %r9
-; AVX512BW-NEXT:    orq %rdx, %r9
-; AVX512BW-NEXT:    shlq $31, %rdi
-; AVX512BW-NEXT:    orq %r9, %rdi
-; AVX512BW-NEXT:    andl $1, %ecx
-; AVX512BW-NEXT:    movq %rcx, %rdx
-; AVX512BW-NEXT:    shlq $32, %rdx
-; AVX512BW-NEXT:    orq %rdi, %rdx
-; AVX512BW-NEXT:    movq %rcx, %rdi
-; AVX512BW-NEXT:    shlq $33, %rdi
-; AVX512BW-NEXT:    orq %rdx, %rdi
-; AVX512BW-NEXT:    shlq $34, %rcx
-; AVX512BW-NEXT:    orq %rdi, %rcx
-; AVX512BW-NEXT:    kmovd %k3, %edx
-; AVX512BW-NEXT:    andl $1, %edx
-; AVX512BW-NEXT:    movq %rdx, %rdi
-; AVX512BW-NEXT:    shlq $35, %rdi
-; AVX512BW-NEXT:    orq %rcx, %rdi
-; AVX512BW-NEXT:    movq %rdx, %rcx
-; AVX512BW-NEXT:    shlq $36, %rcx
-; AVX512BW-NEXT:    orq %rdi, %rcx
-; AVX512BW-NEXT:    movq %rdx, %rdi
-; AVX512BW-NEXT:    shlq $37, %rdi
-; AVX512BW-NEXT:    orq %rcx, %rdi
-; AVX512BW-NEXT:    movq %rdx, %rcx
-; AVX512BW-NEXT:    shlq $38, %rcx
-; AVX512BW-NEXT:    orq %rdi, %rcx
-; AVX512BW-NEXT:    movq %rdx, %rdi
-; AVX512BW-NEXT:    shlq $39, %rdi
-; AVX512BW-NEXT:    orq %rcx, %rdi
-; AVX512BW-NEXT:    movq %rdx, %rcx
-; AVX512BW-NEXT:    shlq $40, %rcx
-; AVX512BW-NEXT:    orq %rdi, %rcx
-; AVX512BW-NEXT:    shlq $41, %rdx
-; AVX512BW-NEXT:    orq %rcx, %rdx
-; AVX512BW-NEXT:    kmovd %k2, %ecx
-; AVX512BW-NEXT:    andl $1, %ecx
-; AVX512BW-NEXT:    movq %rcx, %rdi
-; AVX512BW-NEXT:    shlq $42, %rdi
-; AVX512BW-NEXT:    orq %rdx, %rdi
-; AVX512BW-NEXT:    movq %rcx, %rdx
-; AVX512BW-NEXT:    shlq $43, %rdx
-; AVX512BW-NEXT:    orq %rdi, %rdx
-; AVX512BW-NEXT:    movq %rcx, %rdi
-; AVX512BW-NEXT:    shlq $44, %rdi
-; AVX512BW-NEXT:    orq %rdx, %rdi
-; AVX512BW-NEXT:    movq %rcx, %rdx
-; AVX512BW-NEXT:    shlq $45, %rdx
-; AVX512BW-NEXT:    orq %rdi, %rdx
-; AVX512BW-NEXT:    movq %rcx, %rdi
-; AVX512BW-NEXT:    shlq $46, %rdi
-; AVX512BW-NEXT:    orq %rdx, %rdi
-; AVX512BW-NEXT:    movq %rcx, %rdx
-; AVX512BW-NEXT:    shlq $47, %rdx
-; AVX512BW-NEXT:    orq %rdi, %rdx
-; AVX512BW-NEXT:    shlq $48, %rcx
-; AVX512BW-NEXT:    orq %rdx, %rcx
-; AVX512BW-NEXT:    kmovd %k1, %edx
-; AVX512BW-NEXT:    andl $1, %edx
-; AVX512BW-NEXT:    movq %rdx, %rdi
-; AVX512BW-NEXT:    shlq $49, %rdi
-; AVX512BW-NEXT:    orq %rcx, %rdi
-; AVX512BW-NEXT:    movq %rdx, %rcx
-; AVX512BW-NEXT:    shlq $50, %rcx
-; AVX512BW-NEXT:    orq %rdi, %rcx
-; AVX512BW-NEXT:    movq %rdx, %rdi
-; AVX512BW-NEXT:    shlq $51, %rdi
-; AVX512BW-NEXT:    orq %rcx, %rdi
-; AVX512BW-NEXT:    movq %rdx, %rcx
-; AVX512BW-NEXT:    shlq $52, %rcx
-; AVX512BW-NEXT:    orq %rdi, %rcx
-; AVX512BW-NEXT:    movq %rdx, %rdi
-; AVX512BW-NEXT:    shlq $53, %rdi
-; AVX512BW-NEXT:    orq %rcx, %rdi
-; AVX512BW-NEXT:    movq %rdx, %rcx
-; AVX512BW-NEXT:    shlq $54, %rcx
-; AVX512BW-NEXT:    orq %rdi, %rcx
-; AVX512BW-NEXT:    shlq $55, %rdx
-; AVX512BW-NEXT:    orq %rcx, %rdx
-; AVX512BW-NEXT:    kmovd %k0, %ecx
-; AVX512BW-NEXT:    andl $1, %ecx
-; AVX512BW-NEXT:    movq %rcx, %rdi
-; AVX512BW-NEXT:    shlq $56, %rdi
-; AVX512BW-NEXT:    orq %rdx, %rdi
-; AVX512BW-NEXT:    movq %rcx, %rdx
-; AVX512BW-NEXT:    shlq $57, %rdx
-; AVX512BW-NEXT:    orq %rdi, %rdx
-; AVX512BW-NEXT:    movq %rcx, %rdi
-; AVX512BW-NEXT:    shlq $58, %rdi
-; AVX512BW-NEXT:    orq %rdx, %rdi
-; AVX512BW-NEXT:    movq %rcx, %rdx
-; AVX512BW-NEXT:    shlq $59, %rdx
-; AVX512BW-NEXT:    orq %rdi, %rdx
-; AVX512BW-NEXT:    movq %rcx, %rdi
-; AVX512BW-NEXT:    shlq $60, %rdi
-; AVX512BW-NEXT:    orq %rdx, %rdi
-; AVX512BW-NEXT:    movq %rcx, %rdx
-; AVX512BW-NEXT:    shlq $61, %rdx
-; AVX512BW-NEXT:    orq %rdi, %rdx
-; AVX512BW-NEXT:    shlq $62, %rcx
-; AVX512BW-NEXT:    orq %rdx, %rcx
-; AVX512BW-NEXT:    shlq $63, %rax
-; AVX512BW-NEXT:    orq %rcx, %rax
-; AVX512BW-NEXT:    orq %r8, %rax
-; AVX512BW-NEXT:    movq %rax, (%rsi)
-; AVX512BW-NEXT:    popq %rbx
-; AVX512BW-NEXT:    popq %r12
-; AVX512BW-NEXT:    popq %r14
-; AVX512BW-NEXT:    popq %r15
-; AVX512BW-NEXT:    popq %rbp
-; AVX512BW-NEXT:    retq
-  %src.vec = load <16 x i1>, ptr %in.vec, align 64
-  %tgt.mask = shufflevector <16 x i1> %src.vec, <16 x i1> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-  store <112 x i1> %tgt.mask, ptr %out.vec, align 64
+; AVX512BW-ONLY-LABEL: mask_replication_factor7_vf16:
+; AVX512BW-ONLY:       # %bb.0:
+; AVX512BW-ONLY-NEXT:    kmovw (%rdi), %k2
+; AVX512BW-ONLY-NEXT:    vpmovm2b %k2, %zmm0
+; AVX512BW-ONLY-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
+; AVX512BW-ONLY-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11,27,27,27,27,28,28,28,28,28,28,28,29,29,29,29,29,45,45,46,46,46,46,46,46,46,47,47,47,47,47,47,47,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-ONLY-NEXT:    vpmovb2m %zmm0, %k1
+; AVX512BW-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
+; AVX512BW-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
+; AVX512BW-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512BW-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k2
+; AVX512BW-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k2} {z}
+; AVX512BW-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15]
+; AVX512BW-ONLY-NEXT:    vpermd %zmm0, %zmm2, %zmm2
+; AVX512BW-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k2
+; AVX512BW-ONLY-NEXT:    vmovdqa32 384(%rsi), %zmm2 {%k2} {z}
+; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT:    vmovdqa32 320(%rsi), %zmm3 {%k2} {z}
+; AVX512BW-ONLY-NEXT:    vmovdqa32 256(%rsi), %zmm4 {%k1} {z}
+; AVX512BW-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9]
+; AVX512BW-ONLY-NEXT:    vpermd %zmm0, %zmm5, %zmm5
+; AVX512BW-ONLY-NEXT:    vptestmd %zmm5, %zmm5, %k1
+; AVX512BW-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm5 {%k1} {z}
+; AVX512BW-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6]
+; AVX512BW-ONLY-NEXT:    vpermd %zmm0, %zmm6, %zmm6
+; AVX512BW-ONLY-NEXT:    vptestmd %zmm6, %zmm6, %k1
+; AVX512BW-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm6 {%k1} {z}
+; AVX512BW-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4]
+; AVX512BW-ONLY-NEXT:    vpermd %zmm0, %zmm7, %zmm0
+; AVX512BW-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; AVX512BW-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm6, 128(%rdx)
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm5, 192(%rdx)
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm4, 256(%rdx)
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm3, 320(%rdx)
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm2, 384(%rdx)
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm1, (%rdx)
+; AVX512BW-ONLY-NEXT:    vzeroupper
+; AVX512BW-ONLY-NEXT:    retq
+;
+; AVX512VBMI-ONLY-LABEL: mask_replication_factor7_vf16:
+; AVX512VBMI-ONLY:       # %bb.0:
+; AVX512VBMI-ONLY-NEXT:    kmovw (%rdi), %k2
+; AVX512VBMI-ONLY-NEXT:    vpmovm2b %k2, %zmm0
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11,11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13,13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VBMI-ONLY-NEXT:    vpermb %zmm0, %zmm1, %zmm0
+; AVX512VBMI-ONLY-NEXT:    vpmovb2m %zmm0, %k1
+; AVX512VBMI-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
+; AVX512VBMI-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512VBMI-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k2
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k2} {z}
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15]
+; AVX512VBMI-ONLY-NEXT:    vpermd %zmm0, %zmm2, %zmm2
+; AVX512VBMI-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k2
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 384(%rsi), %zmm2 {%k2} {z}
+; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 320(%rsi), %zmm3 {%k2} {z}
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 256(%rsi), %zmm4 {%k1} {z}
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9]
+; AVX512VBMI-ONLY-NEXT:    vpermd %zmm0, %zmm5, %zmm5
+; AVX512VBMI-ONLY-NEXT:    vptestmd %zmm5, %zmm5, %k1
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm5 {%k1} {z}
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6]
+; AVX512VBMI-ONLY-NEXT:    vpermd %zmm0, %zmm6, %zmm6
+; AVX512VBMI-ONLY-NEXT:    vptestmd %zmm6, %zmm6, %k1
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm6 {%k1} {z}
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4]
+; AVX512VBMI-ONLY-NEXT:    vpermd %zmm0, %zmm7, %zmm0
+; AVX512VBMI-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm6, 128(%rdx)
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm5, 192(%rdx)
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm4, 256(%rdx)
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm3, 320(%rdx)
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm2, 384(%rdx)
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm1, (%rdx)
+; AVX512VBMI-ONLY-NEXT:    vzeroupper
+; AVX512VBMI-ONLY-NEXT:    retq
+  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
+  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %tgt.mask = shufflevector <16 x i1> %src.mask, <16 x i1> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+  %data = call <112 x i32> @llvm.masked.load.v112i32.p0(ptr %in.vec, i32 64, <112 x i1> %tgt.mask, <112 x i32> poison)
+  store <112 x i32> %data, ptr %out.vec, align 64
   ret void
 }
 
-define void @mask_replication_factor7_vf32(ptr %in.vec, ptr %out.vec) nounwind {
+define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512F-ONLY-LABEL: mask_replication_factor7_vf32:
 ; AVX512F-ONLY:       # %bb.0:
-; AVX512F-ONLY-NEXT:    pushq %rbp
-; AVX512F-ONLY-NEXT:    pushq %r15
-; AVX512F-ONLY-NEXT:    pushq %r14
-; AVX512F-ONLY-NEXT:    pushq %r13
-; AVX512F-ONLY-NEXT:    pushq %r12
-; AVX512F-ONLY-NEXT:    pushq %rbx
 ; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
-; AVX512F-ONLY-NEXT:    kmovw 2(%rdi), %k0
-; AVX512F-ONLY-NEXT:    kshiftrw $15, %k1, %k2
-; AVX512F-ONLY-NEXT:    kmovw %k2, %eax
-; AVX512F-ONLY-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512F-ONLY-NEXT:    kshiftrw $13, %k1, %k2
-; AVX512F-ONLY-NEXT:    kmovw %k2, %ecx
-; AVX512F-ONLY-NEXT:    kshiftrw $12, %k1, %k2
-; AVX512F-ONLY-NEXT:    kmovw %k2, %edx
-; AVX512F-ONLY-NEXT:    kshiftrw $11, %k1, %k2
-; AVX512F-ONLY-NEXT:    kmovw %k2, %edi
-; AVX512F-ONLY-NEXT:    kshiftrw $11, %k0, %k2
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r8d
-; AVX512F-ONLY-NEXT:    kshiftrw $6, %k0, %k2
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r14d
-; AVX512F-ONLY-NEXT:    kshiftrw $3, %k0, %k2
-; AVX512F-ONLY-NEXT:    kmovw %k2, %ebp
-; AVX512F-ONLY-NEXT:    kshiftrw $2, %k0, %k2
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r11d
-; AVX512F-ONLY-NEXT:    kshiftrw $12, %k0, %k2
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r15d
-; AVX512F-ONLY-NEXT:    kshiftrw $13, %k0, %k2
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r10d
-; AVX512F-ONLY-NEXT:    kshiftrw $14, %k0, %k2
-; AVX512F-ONLY-NEXT:    movl %r8d, %r9d
-; AVX512F-ONLY-NEXT:    andl $1, %r9d
-; AVX512F-ONLY-NEXT:    leal (%r9,%r9,2), %eax
-; AVX512F-ONLY-NEXT:    leal (%rax,%r9,4), %eax
-; AVX512F-ONLY-NEXT:    leal (%rax,%r9,8), %eax
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    movl %r15d, %r12d
-; AVX512F-ONLY-NEXT:    shll $4, %r12d
-; AVX512F-ONLY-NEXT:    orl %eax, %r12d
-; AVX512F-ONLY-NEXT:    movl %r15d, %ebx
-; AVX512F-ONLY-NEXT:    shll $5, %ebx
-; AVX512F-ONLY-NEXT:    orl %r12d, %ebx
-; AVX512F-ONLY-NEXT:    movl %r15d, %eax
-; AVX512F-ONLY-NEXT:    shll $6, %eax
-; AVX512F-ONLY-NEXT:    movl %r15d, %r12d
-; AVX512F-ONLY-NEXT:    shll $7, %r12d
-; AVX512F-ONLY-NEXT:    orl %eax, %r12d
-; AVX512F-ONLY-NEXT:    movl %r15d, %eax
-; AVX512F-ONLY-NEXT:    shll $8, %eax
-; AVX512F-ONLY-NEXT:    orl %r12d, %eax
-; AVX512F-ONLY-NEXT:    movl %r15d, %r13d
-; AVX512F-ONLY-NEXT:    shll $9, %r13d
-; AVX512F-ONLY-NEXT:    orl %eax, %r13d
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $15, %k0, %k2
-; AVX512F-ONLY-NEXT:    shll $10, %r15d
-; AVX512F-ONLY-NEXT:    orl %r13d, %r15d
-; AVX512F-ONLY-NEXT:    andl $1, %r10d
-; AVX512F-ONLY-NEXT:    movl %r10d, %eax
-; AVX512F-ONLY-NEXT:    shll $11, %eax
-; AVX512F-ONLY-NEXT:    orl %r15d, %eax
-; AVX512F-ONLY-NEXT:    movl %r10d, %r15d
-; AVX512F-ONLY-NEXT:    shll $12, %r15d
-; AVX512F-ONLY-NEXT:    orl %eax, %r15d
-; AVX512F-ONLY-NEXT:    movl %r10d, %eax
-; AVX512F-ONLY-NEXT:    shll $13, %eax
-; AVX512F-ONLY-NEXT:    orl %r15d, %eax
-; AVX512F-ONLY-NEXT:    movl %r10d, %r15d
-; AVX512F-ONLY-NEXT:    shll $14, %r15d
-; AVX512F-ONLY-NEXT:    orl %eax, %r15d
-; AVX512F-ONLY-NEXT:    movl %r10d, %eax
-; AVX512F-ONLY-NEXT:    shll $15, %eax
-; AVX512F-ONLY-NEXT:    orl %r15d, %eax
-; AVX512F-ONLY-NEXT:    movl %r10d, %r15d
-; AVX512F-ONLY-NEXT:    shll $16, %r15d
-; AVX512F-ONLY-NEXT:    orl %eax, %r15d
-; AVX512F-ONLY-NEXT:    kmovw %k2, %eax
-; AVX512F-ONLY-NEXT:    kshiftrw $4, %k1, %k2
-; AVX512F-ONLY-NEXT:    shll $17, %r10d
-; AVX512F-ONLY-NEXT:    orl %r15d, %r10d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movl %r12d, %r15d
-; AVX512F-ONLY-NEXT:    shll $18, %r15d
-; AVX512F-ONLY-NEXT:    orl %r10d, %r15d
-; AVX512F-ONLY-NEXT:    movl %r12d, %r10d
-; AVX512F-ONLY-NEXT:    shll $19, %r10d
-; AVX512F-ONLY-NEXT:    orl %r15d, %r10d
-; AVX512F-ONLY-NEXT:    movl %r12d, %r15d
-; AVX512F-ONLY-NEXT:    shll $20, %r15d
-; AVX512F-ONLY-NEXT:    orl %r10d, %r15d
-; AVX512F-ONLY-NEXT:    movl %r12d, %r10d
-; AVX512F-ONLY-NEXT:    shll $21, %r10d
-; AVX512F-ONLY-NEXT:    orl %r15d, %r10d
-; AVX512F-ONLY-NEXT:    movl %r12d, %r15d
-; AVX512F-ONLY-NEXT:    shll $22, %r15d
-; AVX512F-ONLY-NEXT:    orl %r10d, %r15d
-; AVX512F-ONLY-NEXT:    movl %r12d, %r10d
-; AVX512F-ONLY-NEXT:    shll $23, %r10d
-; AVX512F-ONLY-NEXT:    orl %r15d, %r10d
-; AVX512F-ONLY-NEXT:    shll $24, %r12d
-; AVX512F-ONLY-NEXT:    orl %r10d, %r12d
-; AVX512F-ONLY-NEXT:    movl %eax, %r15d
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    movl %r15d, %r10d
-; AVX512F-ONLY-NEXT:    shll $25, %r10d
-; AVX512F-ONLY-NEXT:    orl %r12d, %r10d
-; AVX512F-ONLY-NEXT:    movl %r15d, %r12d
-; AVX512F-ONLY-NEXT:    shll $26, %r12d
-; AVX512F-ONLY-NEXT:    orl %r10d, %r12d
-; AVX512F-ONLY-NEXT:    movl %r15d, %r10d
-; AVX512F-ONLY-NEXT:    shll $27, %r10d
-; AVX512F-ONLY-NEXT:    orl %r12d, %r10d
-; AVX512F-ONLY-NEXT:    movl %r15d, %r12d
-; AVX512F-ONLY-NEXT:    shll $28, %r12d
-; AVX512F-ONLY-NEXT:    orl %r10d, %r12d
-; AVX512F-ONLY-NEXT:    movl %r15d, %r13d
-; AVX512F-ONLY-NEXT:    shll $29, %r13d
-; AVX512F-ONLY-NEXT:    orl %r12d, %r13d
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r10d
-; AVX512F-ONLY-NEXT:    kshiftrw $1, %k1, %k2
-; AVX512F-ONLY-NEXT:    shll $30, %r15d
-; AVX512F-ONLY-NEXT:    orl %r13d, %r15d
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $2, %k1, %k2
-; AVX512F-ONLY-NEXT:    shll $31, %eax
-; AVX512F-ONLY-NEXT:    orl %r15d, %eax
-; AVX512F-ONLY-NEXT:    orl %ebx, %eax
-; AVX512F-ONLY-NEXT:    movl %eax, 24(%rsi)
-; AVX512F-ONLY-NEXT:    kmovw %k1, %eax
-; AVX512F-ONLY-NEXT:    movzbl %al, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    leaq (%rax,%rax,2), %rbx
-; AVX512F-ONLY-NEXT:    leaq (%rbx,%rax,4), %rbx
-; AVX512F-ONLY-NEXT:    leaq (%rbx,%rax,8), %rbx
-; AVX512F-ONLY-NEXT:    movq %rax, %r15
-; AVX512F-ONLY-NEXT:    shlq $4, %r15
-; AVX512F-ONLY-NEXT:    orq %rbx, %r15
-; AVX512F-ONLY-NEXT:    movq %rax, %rbx
-; AVX512F-ONLY-NEXT:    shlq $5, %rbx
-; AVX512F-ONLY-NEXT:    orq %r15, %rbx
-; AVX512F-ONLY-NEXT:    shlq $6, %rax
-; AVX512F-ONLY-NEXT:    movzbl %r12b, %r15d
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $7, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $8, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $9, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $10, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $11, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $12, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $3, %k1, %k2
-; AVX512F-ONLY-NEXT:    shlq $13, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    movzbl %r12b, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $14, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r15
-; AVX512F-ONLY-NEXT:    shlq $15, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $16, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r15
-; AVX512F-ONLY-NEXT:    shlq $17, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $18, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r15
-; AVX512F-ONLY-NEXT:    shlq $19, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    kmovw %k2, %eax
-; AVX512F-ONLY-NEXT:    kshiftrw $9, %k1, %k2
-; AVX512F-ONLY-NEXT:    shlq $20, %r12
-; AVX512F-ONLY-NEXT:    orq %r15, %r12
-; AVX512F-ONLY-NEXT:    movzbl %al, %r15d
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $21, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $22, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $23, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $24, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $25, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $26, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $27, %r15
-; AVX512F-ONLY-NEXT:    orq %r12, %r15
-; AVX512F-ONLY-NEXT:    movzbl %r10b, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $28, %r12
-; AVX512F-ONLY-NEXT:    orq %r15, %r12
-; AVX512F-ONLY-NEXT:    movq %rax, %r15
-; AVX512F-ONLY-NEXT:    shlq $29, %r15
-; AVX512F-ONLY-NEXT:    orq %r12, %r15
-; AVX512F-ONLY-NEXT:    movq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $30, %r12
-; AVX512F-ONLY-NEXT:    orq %r15, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r15d
-; AVX512F-ONLY-NEXT:    kshiftrw $5, %k1, %k2
-; AVX512F-ONLY-NEXT:    shlq $31, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    andl $1, %r10d
-; AVX512F-ONLY-NEXT:    movq %r10, %r12
-; AVX512F-ONLY-NEXT:    shlq $32, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r10, %rax
-; AVX512F-ONLY-NEXT:    shlq $33, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $6, %k1, %k2
-; AVX512F-ONLY-NEXT:    shlq $34, %r10
-; AVX512F-ONLY-NEXT:    orq %rax, %r10
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $35, %rax
-; AVX512F-ONLY-NEXT:    orq %r10, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r10
-; AVX512F-ONLY-NEXT:    shlq $36, %r10
-; AVX512F-ONLY-NEXT:    orq %rax, %r10
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $37, %rax
-; AVX512F-ONLY-NEXT:    orq %r10, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r10
-; AVX512F-ONLY-NEXT:    shlq $38, %r10
-; AVX512F-ONLY-NEXT:    orq %rax, %r10
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $39, %rax
-; AVX512F-ONLY-NEXT:    orq %r10, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $40, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r10d
-; AVX512F-ONLY-NEXT:    kshiftrw $7, %k1, %k2
-; AVX512F-ONLY-NEXT:    shlq $41, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    andl $1, %r10d
-; AVX512F-ONLY-NEXT:    movq %r10, %rax
-; AVX512F-ONLY-NEXT:    shlq $42, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r10, %r12
-; AVX512F-ONLY-NEXT:    shlq $43, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r10, %rax
-; AVX512F-ONLY-NEXT:    shlq $44, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r10, %r12
-; AVX512F-ONLY-NEXT:    shlq $45, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r10, %rax
-; AVX512F-ONLY-NEXT:    shlq $46, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r10, %r13
-; AVX512F-ONLY-NEXT:    shlq $47, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $8, %k1, %k2
-; AVX512F-ONLY-NEXT:    shlq $48, %r10
-; AVX512F-ONLY-NEXT:    orq %r13, %r10
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $49, %rax
-; AVX512F-ONLY-NEXT:    orq %r10, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r10
-; AVX512F-ONLY-NEXT:    shlq $50, %r10
-; AVX512F-ONLY-NEXT:    orq %rax, %r10
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $51, %rax
-; AVX512F-ONLY-NEXT:    orq %r10, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r10
-; AVX512F-ONLY-NEXT:    shlq $52, %r10
-; AVX512F-ONLY-NEXT:    orq %rax, %r10
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $53, %rax
-; AVX512F-ONLY-NEXT:    orq %r10, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r10
-; AVX512F-ONLY-NEXT:    shlq $54, %r10
-; AVX512F-ONLY-NEXT:    orq %rax, %r10
-; AVX512F-ONLY-NEXT:    kmovw %k2, %eax
-; AVX512F-ONLY-NEXT:    kshiftrw $4, %k0, %k2
-; AVX512F-ONLY-NEXT:    shlq $55, %r12
-; AVX512F-ONLY-NEXT:    orq %r10, %r12
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %r10
-; AVX512F-ONLY-NEXT:    shlq $56, %r10
-; AVX512F-ONLY-NEXT:    orq %r12, %r10
-; AVX512F-ONLY-NEXT:    movq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $57, %r12
-; AVX512F-ONLY-NEXT:    orq %r10, %r12
-; AVX512F-ONLY-NEXT:    movq %rax, %r10
-; AVX512F-ONLY-NEXT:    shlq $58, %r10
-; AVX512F-ONLY-NEXT:    orq %r12, %r10
-; AVX512F-ONLY-NEXT:    movq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $59, %r12
-; AVX512F-ONLY-NEXT:    orq %r10, %r12
-; AVX512F-ONLY-NEXT:    movq %rax, %r10
-; AVX512F-ONLY-NEXT:    shlq $60, %r10
-; AVX512F-ONLY-NEXT:    orq %r12, %r10
-; AVX512F-ONLY-NEXT:    movq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $61, %r12
-; AVX512F-ONLY-NEXT:    orq %r10, %r12
-; AVX512F-ONLY-NEXT:    shlq $62, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movzbl %r15b, %r10d
-; AVX512F-ONLY-NEXT:    movq %r10, %r15
-; AVX512F-ONLY-NEXT:    shlq $63, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    orq %rbx, %r15
-; AVX512F-ONLY-NEXT:    movq %r15, (%rsi)
-; AVX512F-ONLY-NEXT:    movzbl %r11b, %r11d
-; AVX512F-ONLY-NEXT:    movl %r11d, %ebx
-; AVX512F-ONLY-NEXT:    andl $1, %ebx
-; AVX512F-ONLY-NEXT:    leaq (%rbx,%rbx,2), %rax
-; AVX512F-ONLY-NEXT:    leaq (%rax,%rbx,4), %rax
-; AVX512F-ONLY-NEXT:    leaq (%rax,%rbx,8), %rax
-; AVX512F-ONLY-NEXT:    movq %rbx, %r12
-; AVX512F-ONLY-NEXT:    shlq $4, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movzbl %bpl, %r13d
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %r15
-; AVX512F-ONLY-NEXT:    shlq $5, %r15
-; AVX512F-ONLY-NEXT:    orq %r12, %r15
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $6, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $7, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $8, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $9, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $10, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k2, %ebp
-; AVX512F-ONLY-NEXT:    kshiftrw $5, %k0, %k2
-; AVX512F-ONLY-NEXT:    shlq $11, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movzbl %bpl, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $12, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $13, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $14, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $15, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $16, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $17, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k2, %eax
-; AVX512F-ONLY-NEXT:    kshiftrw $10, %k1, %k2
-; AVX512F-ONLY-NEXT:    shlq $18, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    movzbl %al, %r13d
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $19, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $20, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $21, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $22, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $23, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $24, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $25, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    movzbl %r14b, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $26, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    movq %rax, %r13
-; AVX512F-ONLY-NEXT:    shlq $27, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    movq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $28, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    movq %rax, %r13
-; AVX512F-ONLY-NEXT:    shlq $29, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    movq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $30, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k2, %ebp
-; AVX512F-ONLY-NEXT:    kshiftrw $7, %k0, %k2
-; AVX512F-ONLY-NEXT:    shlq $31, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $8, %k0, %k2
-; AVX512F-ONLY-NEXT:    andl $1, %r14d
-; AVX512F-ONLY-NEXT:    shlq $32, %r14
-; AVX512F-ONLY-NEXT:    orq %rax, %r14
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $33, %rax
-; AVX512F-ONLY-NEXT:    orq %r14, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r14
-; AVX512F-ONLY-NEXT:    shlq $34, %r14
-; AVX512F-ONLY-NEXT:    orq %rax, %r14
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $35, %rax
-; AVX512F-ONLY-NEXT:    orq %r14, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r14
-; AVX512F-ONLY-NEXT:    shlq $36, %r14
-; AVX512F-ONLY-NEXT:    orq %rax, %r14
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $37, %rax
-; AVX512F-ONLY-NEXT:    orq %r14, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $38, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r14d
-; AVX512F-ONLY-NEXT:    kshiftrw $9, %k0, %k2
-; AVX512F-ONLY-NEXT:    shlq $39, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    andl $1, %r14d
-; AVX512F-ONLY-NEXT:    movq %r14, %rax
-; AVX512F-ONLY-NEXT:    shlq $40, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r14, %r12
-; AVX512F-ONLY-NEXT:    shlq $41, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r14, %rax
-; AVX512F-ONLY-NEXT:    shlq $42, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r14, %r12
-; AVX512F-ONLY-NEXT:    shlq $43, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r14, %rax
-; AVX512F-ONLY-NEXT:    shlq $44, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r14, %r13
-; AVX512F-ONLY-NEXT:    shlq $45, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $10, %k0, %k2
-; AVX512F-ONLY-NEXT:    shlq $46, %r14
-; AVX512F-ONLY-NEXT:    orq %r13, %r14
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $47, %rax
-; AVX512F-ONLY-NEXT:    orq %r14, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r14
-; AVX512F-ONLY-NEXT:    shlq $48, %r14
-; AVX512F-ONLY-NEXT:    orq %rax, %r14
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $49, %rax
-; AVX512F-ONLY-NEXT:    orq %r14, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r14
-; AVX512F-ONLY-NEXT:    shlq $50, %r14
-; AVX512F-ONLY-NEXT:    orq %rax, %r14
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $51, %rax
-; AVX512F-ONLY-NEXT:    orq %r14, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $52, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r14d
-; AVX512F-ONLY-NEXT:    kshiftrw $14, %k1, %k1
-; AVX512F-ONLY-NEXT:    shlq $53, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    andl $1, %r14d
-; AVX512F-ONLY-NEXT:    movq %r14, %rax
-; AVX512F-ONLY-NEXT:    shlq $54, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r14, %r12
-; AVX512F-ONLY-NEXT:    shlq $55, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r14, %rax
-; AVX512F-ONLY-NEXT:    shlq $56, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r14, %r12
-; AVX512F-ONLY-NEXT:    shlq $57, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r14, %rax
-; AVX512F-ONLY-NEXT:    shlq $58, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r14, %r12
-; AVX512F-ONLY-NEXT:    shlq $59, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k1, %r13d
-; AVX512F-ONLY-NEXT:    shlq $60, %r14
-; AVX512F-ONLY-NEXT:    orq %r12, %r14
-; AVX512F-ONLY-NEXT:    movq %r9, %r12
-; AVX512F-ONLY-NEXT:    shlq $61, %r12
-; AVX512F-ONLY-NEXT:    orq %r14, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k0, %r14d
-; AVX512F-ONLY-NEXT:    kshiftrw $1, %k0, %k0
-; AVX512F-ONLY-NEXT:    shlq $62, %r9
-; AVX512F-ONLY-NEXT:    orq %r12, %r9
-; AVX512F-ONLY-NEXT:    kmovw %k0, %r12d
-; AVX512F-ONLY-NEXT:    movzbl %r8b, %eax
-; AVX512F-ONLY-NEXT:    shlq $63, %rax
-; AVX512F-ONLY-NEXT:    orq %r9, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movq %rax, 16(%rsi)
-; AVX512F-ONLY-NEXT:    andl $1, %r10d
-; AVX512F-ONLY-NEXT:    leaq (%r10,%r10,2), %rax
-; AVX512F-ONLY-NEXT:    leaq (%rax,%r10,4), %rax
-; AVX512F-ONLY-NEXT:    leaq (%rax,%r10,8), %rax
-; AVX512F-ONLY-NEXT:    movq %r10, %r8
-; AVX512F-ONLY-NEXT:    shlq $4, %r8
-; AVX512F-ONLY-NEXT:    orq %rax, %r8
-; AVX512F-ONLY-NEXT:    shlq $5, %r10
-; AVX512F-ONLY-NEXT:    orq %r8, %r10
-; AVX512F-ONLY-NEXT:    movzbl %bpl, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %r8
-; AVX512F-ONLY-NEXT:    shlq $6, %r8
-; AVX512F-ONLY-NEXT:    movq %rax, %r9
-; AVX512F-ONLY-NEXT:    shlq $7, %r9
-; AVX512F-ONLY-NEXT:    orq %r8, %r9
-; AVX512F-ONLY-NEXT:    movq %rax, %r8
-; AVX512F-ONLY-NEXT:    shlq $8, %r8
-; AVX512F-ONLY-NEXT:    orq %r9, %r8
-; AVX512F-ONLY-NEXT:    movq %rax, %r9
-; AVX512F-ONLY-NEXT:    shlq $9, %r9
-; AVX512F-ONLY-NEXT:    orq %r8, %r9
-; AVX512F-ONLY-NEXT:    movq %rax, %r8
-; AVX512F-ONLY-NEXT:    shlq $10, %r8
-; AVX512F-ONLY-NEXT:    orq %r9, %r8
-; AVX512F-ONLY-NEXT:    movq %rax, %r9
-; AVX512F-ONLY-NEXT:    shlq $11, %r9
-; AVX512F-ONLY-NEXT:    orq %r8, %r9
-; AVX512F-ONLY-NEXT:    shlq $12, %rax
-; AVX512F-ONLY-NEXT:    orq %r9, %rax
-; AVX512F-ONLY-NEXT:    movzbl %dil, %edi
-; AVX512F-ONLY-NEXT:    andl $1, %edi
-; AVX512F-ONLY-NEXT:    movq %rdi, %r8
-; AVX512F-ONLY-NEXT:    shlq $13, %r8
-; AVX512F-ONLY-NEXT:    orq %rax, %r8
-; AVX512F-ONLY-NEXT:    movq %rdi, %rax
-; AVX512F-ONLY-NEXT:    shlq $14, %rax
-; AVX512F-ONLY-NEXT:    orq %r8, %rax
-; AVX512F-ONLY-NEXT:    movq %rdi, %r8
-; AVX512F-ONLY-NEXT:    shlq $15, %r8
-; AVX512F-ONLY-NEXT:    orq %rax, %r8
-; AVX512F-ONLY-NEXT:    movq %rdi, %rax
-; AVX512F-ONLY-NEXT:    shlq $16, %rax
-; AVX512F-ONLY-NEXT:    orq %r8, %rax
-; AVX512F-ONLY-NEXT:    movq %rdi, %r8
-; AVX512F-ONLY-NEXT:    shlq $17, %r8
-; AVX512F-ONLY-NEXT:    orq %rax, %r8
-; AVX512F-ONLY-NEXT:    movq %rdi, %rax
-; AVX512F-ONLY-NEXT:    shlq $18, %rax
-; AVX512F-ONLY-NEXT:    orq %r8, %rax
-; AVX512F-ONLY-NEXT:    shlq $19, %rdi
-; AVX512F-ONLY-NEXT:    orq %rax, %rdi
-; AVX512F-ONLY-NEXT:    movzbl %dl, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %rdx
-; AVX512F-ONLY-NEXT:    shlq $20, %rdx
-; AVX512F-ONLY-NEXT:    orq %rdi, %rdx
-; AVX512F-ONLY-NEXT:    movq %rax, %rdi
-; AVX512F-ONLY-NEXT:    shlq $21, %rdi
-; AVX512F-ONLY-NEXT:    orq %rdx, %rdi
-; AVX512F-ONLY-NEXT:    movq %rax, %rdx
-; AVX512F-ONLY-NEXT:    shlq $22, %rdx
-; AVX512F-ONLY-NEXT:    orq %rdi, %rdx
-; AVX512F-ONLY-NEXT:    movq %rax, %rdi
-; AVX512F-ONLY-NEXT:    shlq $23, %rdi
-; AVX512F-ONLY-NEXT:    orq %rdx, %rdi
-; AVX512F-ONLY-NEXT:    movq %rax, %rdx
-; AVX512F-ONLY-NEXT:    shlq $24, %rdx
-; AVX512F-ONLY-NEXT:    orq %rdi, %rdx
-; AVX512F-ONLY-NEXT:    movq %rax, %rdi
-; AVX512F-ONLY-NEXT:    shlq $25, %rdi
-; AVX512F-ONLY-NEXT:    orq %rdx, %rdi
-; AVX512F-ONLY-NEXT:    shlq $26, %rax
-; AVX512F-ONLY-NEXT:    orq %rdi, %rax
-; AVX512F-ONLY-NEXT:    movzbl %cl, %ecx
-; AVX512F-ONLY-NEXT:    andl $1, %ecx
-; AVX512F-ONLY-NEXT:    movq %rcx, %rdx
-; AVX512F-ONLY-NEXT:    shlq $27, %rdx
-; AVX512F-ONLY-NEXT:    orq %rax, %rdx
-; AVX512F-ONLY-NEXT:    movq %rcx, %rax
-; AVX512F-ONLY-NEXT:    shlq $28, %rax
-; AVX512F-ONLY-NEXT:    orq %rdx, %rax
-; AVX512F-ONLY-NEXT:    movq %rcx, %rdx
-; AVX512F-ONLY-NEXT:    shlq $29, %rdx
-; AVX512F-ONLY-NEXT:    orq %rax, %rdx
-; AVX512F-ONLY-NEXT:    movq %rcx, %rax
-; AVX512F-ONLY-NEXT:    shlq $30, %rax
-; AVX512F-ONLY-NEXT:    orq %rdx, %rax
-; AVX512F-ONLY-NEXT:    movq %rcx, %rdx
-; AVX512F-ONLY-NEXT:    shlq $31, %rdx
-; AVX512F-ONLY-NEXT:    orq %rax, %rdx
-; AVX512F-ONLY-NEXT:    movq %rcx, %rax
-; AVX512F-ONLY-NEXT:    shlq $32, %rax
-; AVX512F-ONLY-NEXT:    orq %rdx, %rax
-; AVX512F-ONLY-NEXT:    shlq $33, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $34, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %rcx
-; AVX512F-ONLY-NEXT:    shlq $35, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $36, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %rcx
-; AVX512F-ONLY-NEXT:    shlq $37, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $38, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %rcx
-; AVX512F-ONLY-NEXT:    shlq $39, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    shlq $40, %r13
-; AVX512F-ONLY-NEXT:    orq %rcx, %r13
-; AVX512F-ONLY-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %rcx
-; AVX512F-ONLY-NEXT:    shlq $41, %rcx
-; AVX512F-ONLY-NEXT:    orq %r13, %rcx
-; AVX512F-ONLY-NEXT:    movq %rax, %rdx
-; AVX512F-ONLY-NEXT:    shlq $42, %rdx
-; AVX512F-ONLY-NEXT:    orq %rcx, %rdx
-; AVX512F-ONLY-NEXT:    movq %rax, %rcx
-; AVX512F-ONLY-NEXT:    shlq $43, %rcx
-; AVX512F-ONLY-NEXT:    orq %rdx, %rcx
-; AVX512F-ONLY-NEXT:    movq %rax, %rdx
-; AVX512F-ONLY-NEXT:    shlq $44, %rdx
-; AVX512F-ONLY-NEXT:    orq %rcx, %rdx
-; AVX512F-ONLY-NEXT:    movq %rax, %rcx
-; AVX512F-ONLY-NEXT:    shlq $45, %rcx
-; AVX512F-ONLY-NEXT:    orq %rdx, %rcx
-; AVX512F-ONLY-NEXT:    movq %rax, %rdx
-; AVX512F-ONLY-NEXT:    shlq $46, %rdx
-; AVX512F-ONLY-NEXT:    orq %rcx, %rdx
-; AVX512F-ONLY-NEXT:    shlq $47, %rax
-; AVX512F-ONLY-NEXT:    orq %rdx, %rax
-; AVX512F-ONLY-NEXT:    andl $1, %r14d
-; AVX512F-ONLY-NEXT:    movq %r14, %rcx
-; AVX512F-ONLY-NEXT:    shlq $48, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    movq %r14, %rax
-; AVX512F-ONLY-NEXT:    shlq $49, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    movq %r14, %rcx
-; AVX512F-ONLY-NEXT:    shlq $50, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    movq %r14, %rax
-; AVX512F-ONLY-NEXT:    shlq $51, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    movq %r14, %rcx
-; AVX512F-ONLY-NEXT:    shlq $52, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    movq %r14, %rax
-; AVX512F-ONLY-NEXT:    shlq $53, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    shlq $54, %r14
-; AVX512F-ONLY-NEXT:    orq %rax, %r14
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $55, %rax
-; AVX512F-ONLY-NEXT:    orq %r14, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %rcx
-; AVX512F-ONLY-NEXT:    shlq $56, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $57, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %rcx
-; AVX512F-ONLY-NEXT:    shlq $58, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $59, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %rcx
-; AVX512F-ONLY-NEXT:    shlq $60, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    shlq $61, %r12
-; AVX512F-ONLY-NEXT:    orq %rcx, %r12
-; AVX512F-ONLY-NEXT:    shlq $62, %rbx
-; AVX512F-ONLY-NEXT:    orq %r12, %rbx
-; AVX512F-ONLY-NEXT:    shlq $63, %r11
-; AVX512F-ONLY-NEXT:    orq %rbx, %r11
-; AVX512F-ONLY-NEXT:    orq %r10, %r11
-; AVX512F-ONLY-NEXT:    movq %r11, 8(%rsi)
-; AVX512F-ONLY-NEXT:    popq %rbx
-; AVX512F-ONLY-NEXT:    popq %r12
-; AVX512F-ONLY-NEXT:    popq %r13
-; AVX512F-ONLY-NEXT:    popq %r14
-; AVX512F-ONLY-NEXT:    popq %r15
-; AVX512F-ONLY-NEXT:    popq %rbp
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm2
+; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; AVX512F-ONLY-NEXT:    movw $1, %ax
+; AVX512F-ONLY-NEXT:    kmovw %eax, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1}
+; AVX512F-ONLY-NEXT:    kmovw 2(%rdi), %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm2, %zmm4
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm5, %zmm6
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm7, %zmm8
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm9, %zmm10
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm11, %zmm12
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm13 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm13, %zmm0
+; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm2, %zmm2
+; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm5, %zmm5
+; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm7, %zmm7
+; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm9, %zmm9
+; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm11, %zmm11
+; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm13, %zmm3
+; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm13 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm3, %zmm3, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 832(%rsi), %zmm3 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm11, %zmm11, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 768(%rsi), %zmm11 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm9, %zmm9, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 704(%rsi), %zmm9 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm7, %zmm7, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 640(%rsi), %zmm7 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm5, %zmm5, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 576(%rsi), %zmm5 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 512(%rsi), %zmm2 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 448(%rsi), %zmm1 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 384(%rsi), %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm12, %zmm12, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 320(%rsi), %zmm12 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm10, %zmm10, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 256(%rsi), %zmm10 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm8, %zmm8, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm8 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm6, %zmm6, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm6 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm4, %zmm4, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm4 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm4, 64(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm6, 128(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm8, 192(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm10, 256(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm12, 320(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, 384(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm1, 448(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm2, 512(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm5, 576(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm7, 640(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm9, 704(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm11, 768(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm3, 832(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm13, (%rdx)
+; AVX512F-ONLY-NEXT:    vzeroupper
 ; AVX512F-ONLY-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: mask_replication_factor7_vf32:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    pushq %rbp
-; AVX512DQ-NEXT:    pushq %r15
-; AVX512DQ-NEXT:    pushq %r14
-; AVX512DQ-NEXT:    pushq %r13
-; AVX512DQ-NEXT:    pushq %r12
-; AVX512DQ-NEXT:    pushq %rbx
-; AVX512DQ-NEXT:    kmovw (%rdi), %k1
+; AVX512DQ-NEXT:    kmovw (%rdi), %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm2
+; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm2
+; AVX512DQ-NEXT:    movw $1, %ax
+; AVX512DQ-NEXT:    kmovw %eax, %k1
+; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1}
 ; AVX512DQ-NEXT:    kmovw 2(%rdi), %k0
-; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k2
-; AVX512DQ-NEXT:    kmovw %k2, %eax
-; AVX512DQ-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512DQ-NEXT:    kshiftrw $13, %k1, %k2
-; AVX512DQ-NEXT:    kmovw %k2, %ecx
-; AVX512DQ-NEXT:    kshiftrw $12, %k1, %k2
-; AVX512DQ-NEXT:    kmovw %k2, %edx
-; AVX512DQ-NEXT:    kshiftrw $11, %k1, %k2
-; AVX512DQ-NEXT:    kmovw %k2, %edi
-; AVX512DQ-NEXT:    kshiftrw $11, %k0, %k2
-; AVX512DQ-NEXT:    kmovw %k2, %r8d
-; AVX512DQ-NEXT:    kshiftrw $6, %k0, %k2
-; AVX512DQ-NEXT:    kmovw %k2, %r14d
-; AVX512DQ-NEXT:    kshiftrw $3, %k0, %k2
-; AVX512DQ-NEXT:    kmovw %k2, %ebp
-; AVX512DQ-NEXT:    kshiftrw $2, %k0, %k2
-; AVX512DQ-NEXT:    kmovw %k2, %r11d
-; AVX512DQ-NEXT:    kshiftrw $12, %k0, %k2
-; AVX512DQ-NEXT:    kmovw %k2, %r15d
-; AVX512DQ-NEXT:    kshiftrw $13, %k0, %k2
-; AVX512DQ-NEXT:    kmovw %k2, %r10d
-; AVX512DQ-NEXT:    kshiftrw $14, %k0, %k2
-; AVX512DQ-NEXT:    movl %r8d, %r9d
-; AVX512DQ-NEXT:    andl $1, %r9d
-; AVX512DQ-NEXT:    leal (%r9,%r9,2), %eax
-; AVX512DQ-NEXT:    leal (%rax,%r9,4), %eax
-; AVX512DQ-NEXT:    leal (%rax,%r9,8), %eax
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    movl %r15d, %r12d
-; AVX512DQ-NEXT:    shll $4, %r12d
-; AVX512DQ-NEXT:    orl %eax, %r12d
-; AVX512DQ-NEXT:    movl %r15d, %ebx
-; AVX512DQ-NEXT:    shll $5, %ebx
-; AVX512DQ-NEXT:    orl %r12d, %ebx
-; AVX512DQ-NEXT:    movl %r15d, %eax
-; AVX512DQ-NEXT:    shll $6, %eax
-; AVX512DQ-NEXT:    movl %r15d, %r12d
-; AVX512DQ-NEXT:    shll $7, %r12d
-; AVX512DQ-NEXT:    orl %eax, %r12d
-; AVX512DQ-NEXT:    movl %r15d, %eax
-; AVX512DQ-NEXT:    shll $8, %eax
-; AVX512DQ-NEXT:    orl %r12d, %eax
-; AVX512DQ-NEXT:    movl %r15d, %r13d
-; AVX512DQ-NEXT:    shll $9, %r13d
-; AVX512DQ-NEXT:    orl %eax, %r13d
-; AVX512DQ-NEXT:    kmovw %k2, %r12d
-; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k2
-; AVX512DQ-NEXT:    shll $10, %r15d
-; AVX512DQ-NEXT:    orl %r13d, %r15d
-; AVX512DQ-NEXT:    andl $1, %r10d
-; AVX512DQ-NEXT:    movl %r10d, %eax
-; AVX512DQ-NEXT:    shll $11, %eax
-; AVX512DQ-NEXT:    orl %r15d, %eax
-; AVX512DQ-NEXT:    movl %r10d, %r15d
-; AVX512DQ-NEXT:    shll $12, %r15d
-; AVX512DQ-NEXT:    orl %eax, %r15d
-; AVX512DQ-NEXT:    movl %r10d, %eax
-; AVX512DQ-NEXT:    shll $13, %eax
-; AVX512DQ-NEXT:    orl %r15d, %eax
-; AVX512DQ-NEXT:    movl %r10d, %r15d
-; AVX512DQ-NEXT:    shll $14, %r15d
-; AVX512DQ-NEXT:    orl %eax, %r15d
-; AVX512DQ-NEXT:    movl %r10d, %eax
-; AVX512DQ-NEXT:    shll $15, %eax
-; AVX512DQ-NEXT:    orl %r15d, %eax
-; AVX512DQ-NEXT:    movl %r10d, %r15d
-; AVX512DQ-NEXT:    shll $16, %r15d
-; AVX512DQ-NEXT:    orl %eax, %r15d
-; AVX512DQ-NEXT:    kmovw %k2, %eax
-; AVX512DQ-NEXT:    kshiftrw $4, %k1, %k2
-; AVX512DQ-NEXT:    shll $17, %r10d
-; AVX512DQ-NEXT:    orl %r15d, %r10d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movl %r12d, %r15d
-; AVX512DQ-NEXT:    shll $18, %r15d
-; AVX512DQ-NEXT:    orl %r10d, %r15d
-; AVX512DQ-NEXT:    movl %r12d, %r10d
-; AVX512DQ-NEXT:    shll $19, %r10d
-; AVX512DQ-NEXT:    orl %r15d, %r10d
-; AVX512DQ-NEXT:    movl %r12d, %r15d
-; AVX512DQ-NEXT:    shll $20, %r15d
-; AVX512DQ-NEXT:    orl %r10d, %r15d
-; AVX512DQ-NEXT:    movl %r12d, %r10d
-; AVX512DQ-NEXT:    shll $21, %r10d
-; AVX512DQ-NEXT:    orl %r15d, %r10d
-; AVX512DQ-NEXT:    movl %r12d, %r15d
-; AVX512DQ-NEXT:    shll $22, %r15d
-; AVX512DQ-NEXT:    orl %r10d, %r15d
-; AVX512DQ-NEXT:    movl %r12d, %r10d
-; AVX512DQ-NEXT:    shll $23, %r10d
-; AVX512DQ-NEXT:    orl %r15d, %r10d
-; AVX512DQ-NEXT:    shll $24, %r12d
-; AVX512DQ-NEXT:    orl %r10d, %r12d
-; AVX512DQ-NEXT:    movl %eax, %r15d
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    movl %r15d, %r10d
-; AVX512DQ-NEXT:    shll $25, %r10d
-; AVX512DQ-NEXT:    orl %r12d, %r10d
-; AVX512DQ-NEXT:    movl %r15d, %r12d
-; AVX512DQ-NEXT:    shll $26, %r12d
-; AVX512DQ-NEXT:    orl %r10d, %r12d
-; AVX512DQ-NEXT:    movl %r15d, %r10d
-; AVX512DQ-NEXT:    shll $27, %r10d
-; AVX512DQ-NEXT:    orl %r12d, %r10d
-; AVX512DQ-NEXT:    movl %r15d, %r12d
-; AVX512DQ-NEXT:    shll $28, %r12d
-; AVX512DQ-NEXT:    orl %r10d, %r12d
-; AVX512DQ-NEXT:    movl %r15d, %r13d
-; AVX512DQ-NEXT:    shll $29, %r13d
-; AVX512DQ-NEXT:    orl %r12d, %r13d
-; AVX512DQ-NEXT:    kmovw %k2, %r10d
-; AVX512DQ-NEXT:    kshiftrw $1, %k1, %k2
-; AVX512DQ-NEXT:    shll $30, %r15d
-; AVX512DQ-NEXT:    orl %r13d, %r15d
-; AVX512DQ-NEXT:    kmovw %k2, %r12d
-; AVX512DQ-NEXT:    kshiftrw $2, %k1, %k2
-; AVX512DQ-NEXT:    shll $31, %eax
-; AVX512DQ-NEXT:    orl %r15d, %eax
-; AVX512DQ-NEXT:    orl %ebx, %eax
-; AVX512DQ-NEXT:    movl %eax, 24(%rsi)
-; AVX512DQ-NEXT:    kmovw %k1, %eax
-; AVX512DQ-NEXT:    movzbl %al, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    leaq (%rax,%rax,2), %rbx
-; AVX512DQ-NEXT:    leaq (%rbx,%rax,4), %rbx
-; AVX512DQ-NEXT:    leaq (%rbx,%rax,8), %rbx
-; AVX512DQ-NEXT:    movq %rax, %r15
-; AVX512DQ-NEXT:    shlq $4, %r15
-; AVX512DQ-NEXT:    orq %rbx, %r15
-; AVX512DQ-NEXT:    movq %rax, %rbx
-; AVX512DQ-NEXT:    shlq $5, %rbx
-; AVX512DQ-NEXT:    orq %r15, %rbx
-; AVX512DQ-NEXT:    shlq $6, %rax
-; AVX512DQ-NEXT:    movzbl %r12b, %r15d
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $7, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $8, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $9, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $10, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $11, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $12, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    kmovw %k2, %r12d
-; AVX512DQ-NEXT:    kshiftrw $3, %k1, %k2
-; AVX512DQ-NEXT:    shlq $13, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    movzbl %r12b, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $14, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movq %r12, %r15
-; AVX512DQ-NEXT:    shlq $15, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $16, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movq %r12, %r15
-; AVX512DQ-NEXT:    shlq $17, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $18, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movq %r12, %r15
-; AVX512DQ-NEXT:    shlq $19, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    kmovw %k2, %eax
-; AVX512DQ-NEXT:    kshiftrw $9, %k1, %k2
-; AVX512DQ-NEXT:    shlq $20, %r12
-; AVX512DQ-NEXT:    orq %r15, %r12
-; AVX512DQ-NEXT:    movzbl %al, %r15d
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $21, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $22, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $23, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $24, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $25, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $26, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    shlq $27, %r15
-; AVX512DQ-NEXT:    orq %r12, %r15
-; AVX512DQ-NEXT:    movzbl %r10b, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %r12
-; AVX512DQ-NEXT:    shlq $28, %r12
-; AVX512DQ-NEXT:    orq %r15, %r12
-; AVX512DQ-NEXT:    movq %rax, %r15
-; AVX512DQ-NEXT:    shlq $29, %r15
-; AVX512DQ-NEXT:    orq %r12, %r15
-; AVX512DQ-NEXT:    movq %rax, %r12
-; AVX512DQ-NEXT:    shlq $30, %r12
-; AVX512DQ-NEXT:    orq %r15, %r12
-; AVX512DQ-NEXT:    kmovw %k2, %r15d
-; AVX512DQ-NEXT:    kshiftrw $5, %k1, %k2
-; AVX512DQ-NEXT:    shlq $31, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    andl $1, %r10d
-; AVX512DQ-NEXT:    movq %r10, %r12
-; AVX512DQ-NEXT:    shlq $32, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r10, %rax
-; AVX512DQ-NEXT:    shlq $33, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    kmovw %k2, %r12d
-; AVX512DQ-NEXT:    kshiftrw $6, %k1, %k2
-; AVX512DQ-NEXT:    shlq $34, %r10
-; AVX512DQ-NEXT:    orq %rax, %r10
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $35, %rax
-; AVX512DQ-NEXT:    orq %r10, %rax
-; AVX512DQ-NEXT:    movq %r12, %r10
-; AVX512DQ-NEXT:    shlq $36, %r10
-; AVX512DQ-NEXT:    orq %rax, %r10
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $37, %rax
-; AVX512DQ-NEXT:    orq %r10, %rax
-; AVX512DQ-NEXT:    movq %r12, %r10
-; AVX512DQ-NEXT:    shlq $38, %r10
-; AVX512DQ-NEXT:    orq %rax, %r10
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $39, %rax
-; AVX512DQ-NEXT:    orq %r10, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $40, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    kmovw %k2, %r10d
-; AVX512DQ-NEXT:    kshiftrw $7, %k1, %k2
-; AVX512DQ-NEXT:    shlq $41, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    andl $1, %r10d
-; AVX512DQ-NEXT:    movq %r10, %rax
-; AVX512DQ-NEXT:    shlq $42, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r10, %r12
-; AVX512DQ-NEXT:    shlq $43, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r10, %rax
-; AVX512DQ-NEXT:    shlq $44, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r10, %r12
-; AVX512DQ-NEXT:    shlq $45, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r10, %rax
-; AVX512DQ-NEXT:    shlq $46, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r10, %r13
-; AVX512DQ-NEXT:    shlq $47, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    kmovw %k2, %r12d
-; AVX512DQ-NEXT:    kshiftrw $8, %k1, %k2
-; AVX512DQ-NEXT:    shlq $48, %r10
-; AVX512DQ-NEXT:    orq %r13, %r10
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $49, %rax
-; AVX512DQ-NEXT:    orq %r10, %rax
-; AVX512DQ-NEXT:    movq %r12, %r10
-; AVX512DQ-NEXT:    shlq $50, %r10
-; AVX512DQ-NEXT:    orq %rax, %r10
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $51, %rax
-; AVX512DQ-NEXT:    orq %r10, %rax
-; AVX512DQ-NEXT:    movq %r12, %r10
-; AVX512DQ-NEXT:    shlq $52, %r10
-; AVX512DQ-NEXT:    orq %rax, %r10
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $53, %rax
-; AVX512DQ-NEXT:    orq %r10, %rax
-; AVX512DQ-NEXT:    movq %r12, %r10
-; AVX512DQ-NEXT:    shlq $54, %r10
-; AVX512DQ-NEXT:    orq %rax, %r10
-; AVX512DQ-NEXT:    kmovw %k2, %eax
-; AVX512DQ-NEXT:    kshiftrw $4, %k0, %k2
-; AVX512DQ-NEXT:    shlq $55, %r12
-; AVX512DQ-NEXT:    orq %r10, %r12
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %r10
-; AVX512DQ-NEXT:    shlq $56, %r10
-; AVX512DQ-NEXT:    orq %r12, %r10
-; AVX512DQ-NEXT:    movq %rax, %r12
-; AVX512DQ-NEXT:    shlq $57, %r12
-; AVX512DQ-NEXT:    orq %r10, %r12
-; AVX512DQ-NEXT:    movq %rax, %r10
-; AVX512DQ-NEXT:    shlq $58, %r10
-; AVX512DQ-NEXT:    orq %r12, %r10
-; AVX512DQ-NEXT:    movq %rax, %r12
-; AVX512DQ-NEXT:    shlq $59, %r12
-; AVX512DQ-NEXT:    orq %r10, %r12
-; AVX512DQ-NEXT:    movq %rax, %r10
-; AVX512DQ-NEXT:    shlq $60, %r10
-; AVX512DQ-NEXT:    orq %r12, %r10
-; AVX512DQ-NEXT:    movq %rax, %r12
-; AVX512DQ-NEXT:    shlq $61, %r12
-; AVX512DQ-NEXT:    orq %r10, %r12
-; AVX512DQ-NEXT:    shlq $62, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movzbl %r15b, %r10d
-; AVX512DQ-NEXT:    movq %r10, %r15
-; AVX512DQ-NEXT:    shlq $63, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    orq %rbx, %r15
-; AVX512DQ-NEXT:    movq %r15, (%rsi)
-; AVX512DQ-NEXT:    movzbl %r11b, %r11d
-; AVX512DQ-NEXT:    movl %r11d, %ebx
-; AVX512DQ-NEXT:    andl $1, %ebx
-; AVX512DQ-NEXT:    leaq (%rbx,%rbx,2), %rax
-; AVX512DQ-NEXT:    leaq (%rax,%rbx,4), %rax
-; AVX512DQ-NEXT:    leaq (%rax,%rbx,8), %rax
-; AVX512DQ-NEXT:    movq %rbx, %r12
-; AVX512DQ-NEXT:    shlq $4, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movzbl %bpl, %r13d
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %r15
-; AVX512DQ-NEXT:    shlq $5, %r15
-; AVX512DQ-NEXT:    orq %r12, %r15
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $6, %rax
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $7, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $8, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $9, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $10, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    kmovw %k2, %ebp
-; AVX512DQ-NEXT:    kshiftrw $5, %k0, %k2
-; AVX512DQ-NEXT:    shlq $11, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movzbl %bpl, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $12, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $13, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $14, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $15, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $16, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $17, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    kmovw %k2, %eax
-; AVX512DQ-NEXT:    kshiftrw $10, %k1, %k2
-; AVX512DQ-NEXT:    shlq $18, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    movzbl %al, %r13d
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $19, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $20, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $21, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $22, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $23, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $24, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    shlq $25, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    movzbl %r14b, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %r12
-; AVX512DQ-NEXT:    shlq $26, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    movq %rax, %r13
-; AVX512DQ-NEXT:    shlq $27, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    movq %rax, %r12
-; AVX512DQ-NEXT:    shlq $28, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    movq %rax, %r13
-; AVX512DQ-NEXT:    shlq $29, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    movq %rax, %r12
-; AVX512DQ-NEXT:    shlq $30, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    kmovw %k2, %ebp
-; AVX512DQ-NEXT:    kshiftrw $7, %k0, %k2
-; AVX512DQ-NEXT:    shlq $31, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    kmovw %k2, %r12d
-; AVX512DQ-NEXT:    kshiftrw $8, %k0, %k2
-; AVX512DQ-NEXT:    andl $1, %r14d
-; AVX512DQ-NEXT:    shlq $32, %r14
-; AVX512DQ-NEXT:    orq %rax, %r14
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $33, %rax
-; AVX512DQ-NEXT:    orq %r14, %rax
-; AVX512DQ-NEXT:    movq %r12, %r14
-; AVX512DQ-NEXT:    shlq $34, %r14
-; AVX512DQ-NEXT:    orq %rax, %r14
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $35, %rax
-; AVX512DQ-NEXT:    orq %r14, %rax
-; AVX512DQ-NEXT:    movq %r12, %r14
-; AVX512DQ-NEXT:    shlq $36, %r14
-; AVX512DQ-NEXT:    orq %rax, %r14
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $37, %rax
-; AVX512DQ-NEXT:    orq %r14, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $38, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    kmovw %k2, %r14d
-; AVX512DQ-NEXT:    kshiftrw $9, %k0, %k2
-; AVX512DQ-NEXT:    shlq $39, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    andl $1, %r14d
-; AVX512DQ-NEXT:    movq %r14, %rax
-; AVX512DQ-NEXT:    shlq $40, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r14, %r12
-; AVX512DQ-NEXT:    shlq $41, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r14, %rax
-; AVX512DQ-NEXT:    shlq $42, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r14, %r12
-; AVX512DQ-NEXT:    shlq $43, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r14, %rax
-; AVX512DQ-NEXT:    shlq $44, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r14, %r13
-; AVX512DQ-NEXT:    shlq $45, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    kmovw %k2, %r12d
-; AVX512DQ-NEXT:    kshiftrw $10, %k0, %k2
-; AVX512DQ-NEXT:    shlq $46, %r14
-; AVX512DQ-NEXT:    orq %r13, %r14
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $47, %rax
-; AVX512DQ-NEXT:    orq %r14, %rax
-; AVX512DQ-NEXT:    movq %r12, %r14
-; AVX512DQ-NEXT:    shlq $48, %r14
-; AVX512DQ-NEXT:    orq %rax, %r14
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $49, %rax
-; AVX512DQ-NEXT:    orq %r14, %rax
-; AVX512DQ-NEXT:    movq %r12, %r14
-; AVX512DQ-NEXT:    shlq $50, %r14
-; AVX512DQ-NEXT:    orq %rax, %r14
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $51, %rax
-; AVX512DQ-NEXT:    orq %r14, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $52, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    kmovw %k2, %r14d
-; AVX512DQ-NEXT:    kshiftrw $14, %k1, %k1
-; AVX512DQ-NEXT:    shlq $53, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    andl $1, %r14d
-; AVX512DQ-NEXT:    movq %r14, %rax
-; AVX512DQ-NEXT:    shlq $54, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r14, %r12
-; AVX512DQ-NEXT:    shlq $55, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r14, %rax
-; AVX512DQ-NEXT:    shlq $56, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r14, %r12
-; AVX512DQ-NEXT:    shlq $57, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r14, %rax
-; AVX512DQ-NEXT:    shlq $58, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r14, %r12
-; AVX512DQ-NEXT:    shlq $59, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    kmovw %k1, %r13d
-; AVX512DQ-NEXT:    shlq $60, %r14
-; AVX512DQ-NEXT:    orq %r12, %r14
-; AVX512DQ-NEXT:    movq %r9, %r12
-; AVX512DQ-NEXT:    shlq $61, %r12
-; AVX512DQ-NEXT:    orq %r14, %r12
-; AVX512DQ-NEXT:    kmovw %k0, %r14d
-; AVX512DQ-NEXT:    kshiftrw $1, %k0, %k0
-; AVX512DQ-NEXT:    shlq $62, %r9
-; AVX512DQ-NEXT:    orq %r12, %r9
-; AVX512DQ-NEXT:    kmovw %k0, %r12d
-; AVX512DQ-NEXT:    movzbl %r8b, %eax
-; AVX512DQ-NEXT:    shlq $63, %rax
-; AVX512DQ-NEXT:    orq %r9, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movq %rax, 16(%rsi)
-; AVX512DQ-NEXT:    andl $1, %r10d
-; AVX512DQ-NEXT:    leaq (%r10,%r10,2), %rax
-; AVX512DQ-NEXT:    leaq (%rax,%r10,4), %rax
-; AVX512DQ-NEXT:    leaq (%rax,%r10,8), %rax
-; AVX512DQ-NEXT:    movq %r10, %r8
-; AVX512DQ-NEXT:    shlq $4, %r8
-; AVX512DQ-NEXT:    orq %rax, %r8
-; AVX512DQ-NEXT:    shlq $5, %r10
-; AVX512DQ-NEXT:    orq %r8, %r10
-; AVX512DQ-NEXT:    movzbl %bpl, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %r8
-; AVX512DQ-NEXT:    shlq $6, %r8
-; AVX512DQ-NEXT:    movq %rax, %r9
-; AVX512DQ-NEXT:    shlq $7, %r9
-; AVX512DQ-NEXT:    orq %r8, %r9
-; AVX512DQ-NEXT:    movq %rax, %r8
-; AVX512DQ-NEXT:    shlq $8, %r8
-; AVX512DQ-NEXT:    orq %r9, %r8
-; AVX512DQ-NEXT:    movq %rax, %r9
-; AVX512DQ-NEXT:    shlq $9, %r9
-; AVX512DQ-NEXT:    orq %r8, %r9
-; AVX512DQ-NEXT:    movq %rax, %r8
-; AVX512DQ-NEXT:    shlq $10, %r8
-; AVX512DQ-NEXT:    orq %r9, %r8
-; AVX512DQ-NEXT:    movq %rax, %r9
-; AVX512DQ-NEXT:    shlq $11, %r9
-; AVX512DQ-NEXT:    orq %r8, %r9
-; AVX512DQ-NEXT:    shlq $12, %rax
-; AVX512DQ-NEXT:    orq %r9, %rax
-; AVX512DQ-NEXT:    movzbl %dil, %edi
-; AVX512DQ-NEXT:    andl $1, %edi
-; AVX512DQ-NEXT:    movq %rdi, %r8
-; AVX512DQ-NEXT:    shlq $13, %r8
-; AVX512DQ-NEXT:    orq %rax, %r8
-; AVX512DQ-NEXT:    movq %rdi, %rax
-; AVX512DQ-NEXT:    shlq $14, %rax
-; AVX512DQ-NEXT:    orq %r8, %rax
-; AVX512DQ-NEXT:    movq %rdi, %r8
-; AVX512DQ-NEXT:    shlq $15, %r8
-; AVX512DQ-NEXT:    orq %rax, %r8
-; AVX512DQ-NEXT:    movq %rdi, %rax
-; AVX512DQ-NEXT:    shlq $16, %rax
-; AVX512DQ-NEXT:    orq %r8, %rax
-; AVX512DQ-NEXT:    movq %rdi, %r8
-; AVX512DQ-NEXT:    shlq $17, %r8
-; AVX512DQ-NEXT:    orq %rax, %r8
-; AVX512DQ-NEXT:    movq %rdi, %rax
-; AVX512DQ-NEXT:    shlq $18, %rax
-; AVX512DQ-NEXT:    orq %r8, %rax
-; AVX512DQ-NEXT:    shlq $19, %rdi
-; AVX512DQ-NEXT:    orq %rax, %rdi
-; AVX512DQ-NEXT:    movzbl %dl, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %rdx
-; AVX512DQ-NEXT:    shlq $20, %rdx
-; AVX512DQ-NEXT:    orq %rdi, %rdx
-; AVX512DQ-NEXT:    movq %rax, %rdi
-; AVX512DQ-NEXT:    shlq $21, %rdi
-; AVX512DQ-NEXT:    orq %rdx, %rdi
-; AVX512DQ-NEXT:    movq %rax, %rdx
-; AVX512DQ-NEXT:    shlq $22, %rdx
-; AVX512DQ-NEXT:    orq %rdi, %rdx
-; AVX512DQ-NEXT:    movq %rax, %rdi
-; AVX512DQ-NEXT:    shlq $23, %rdi
-; AVX512DQ-NEXT:    orq %rdx, %rdi
-; AVX512DQ-NEXT:    movq %rax, %rdx
-; AVX512DQ-NEXT:    shlq $24, %rdx
-; AVX512DQ-NEXT:    orq %rdi, %rdx
-; AVX512DQ-NEXT:    movq %rax, %rdi
-; AVX512DQ-NEXT:    shlq $25, %rdi
-; AVX512DQ-NEXT:    orq %rdx, %rdi
-; AVX512DQ-NEXT:    shlq $26, %rax
-; AVX512DQ-NEXT:    orq %rdi, %rax
-; AVX512DQ-NEXT:    movzbl %cl, %ecx
-; AVX512DQ-NEXT:    andl $1, %ecx
-; AVX512DQ-NEXT:    movq %rcx, %rdx
-; AVX512DQ-NEXT:    shlq $27, %rdx
-; AVX512DQ-NEXT:    orq %rax, %rdx
-; AVX512DQ-NEXT:    movq %rcx, %rax
-; AVX512DQ-NEXT:    shlq $28, %rax
-; AVX512DQ-NEXT:    orq %rdx, %rax
-; AVX512DQ-NEXT:    movq %rcx, %rdx
-; AVX512DQ-NEXT:    shlq $29, %rdx
-; AVX512DQ-NEXT:    orq %rax, %rdx
-; AVX512DQ-NEXT:    movq %rcx, %rax
-; AVX512DQ-NEXT:    shlq $30, %rax
-; AVX512DQ-NEXT:    orq %rdx, %rax
-; AVX512DQ-NEXT:    movq %rcx, %rdx
-; AVX512DQ-NEXT:    shlq $31, %rdx
-; AVX512DQ-NEXT:    orq %rax, %rdx
-; AVX512DQ-NEXT:    movq %rcx, %rax
-; AVX512DQ-NEXT:    shlq $32, %rax
-; AVX512DQ-NEXT:    orq %rdx, %rax
-; AVX512DQ-NEXT:    shlq $33, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $34, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    movq %r13, %rcx
-; AVX512DQ-NEXT:    shlq $35, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $36, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    movq %r13, %rcx
-; AVX512DQ-NEXT:    shlq $37, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $38, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    movq %r13, %rcx
-; AVX512DQ-NEXT:    shlq $39, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    shlq $40, %r13
-; AVX512DQ-NEXT:    orq %rcx, %r13
-; AVX512DQ-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %rcx
-; AVX512DQ-NEXT:    shlq $41, %rcx
-; AVX512DQ-NEXT:    orq %r13, %rcx
-; AVX512DQ-NEXT:    movq %rax, %rdx
-; AVX512DQ-NEXT:    shlq $42, %rdx
-; AVX512DQ-NEXT:    orq %rcx, %rdx
-; AVX512DQ-NEXT:    movq %rax, %rcx
-; AVX512DQ-NEXT:    shlq $43, %rcx
-; AVX512DQ-NEXT:    orq %rdx, %rcx
-; AVX512DQ-NEXT:    movq %rax, %rdx
-; AVX512DQ-NEXT:    shlq $44, %rdx
-; AVX512DQ-NEXT:    orq %rcx, %rdx
-; AVX512DQ-NEXT:    movq %rax, %rcx
-; AVX512DQ-NEXT:    shlq $45, %rcx
-; AVX512DQ-NEXT:    orq %rdx, %rcx
-; AVX512DQ-NEXT:    movq %rax, %rdx
-; AVX512DQ-NEXT:    shlq $46, %rdx
-; AVX512DQ-NEXT:    orq %rcx, %rdx
-; AVX512DQ-NEXT:    shlq $47, %rax
-; AVX512DQ-NEXT:    orq %rdx, %rax
-; AVX512DQ-NEXT:    andl $1, %r14d
-; AVX512DQ-NEXT:    movq %r14, %rcx
-; AVX512DQ-NEXT:    shlq $48, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    movq %r14, %rax
-; AVX512DQ-NEXT:    shlq $49, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    movq %r14, %rcx
-; AVX512DQ-NEXT:    shlq $50, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    movq %r14, %rax
-; AVX512DQ-NEXT:    shlq $51, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    movq %r14, %rcx
-; AVX512DQ-NEXT:    shlq $52, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    movq %r14, %rax
-; AVX512DQ-NEXT:    shlq $53, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    shlq $54, %r14
-; AVX512DQ-NEXT:    orq %rax, %r14
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $55, %rax
-; AVX512DQ-NEXT:    orq %r14, %rax
-; AVX512DQ-NEXT:    movq %r12, %rcx
-; AVX512DQ-NEXT:    shlq $56, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $57, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    movq %r12, %rcx
-; AVX512DQ-NEXT:    shlq $58, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $59, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    movq %r12, %rcx
-; AVX512DQ-NEXT:    shlq $60, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    shlq $61, %r12
-; AVX512DQ-NEXT:    orq %rcx, %r12
-; AVX512DQ-NEXT:    shlq $62, %rbx
-; AVX512DQ-NEXT:    orq %r12, %rbx
-; AVX512DQ-NEXT:    shlq $63, %r11
-; AVX512DQ-NEXT:    orq %rbx, %r11
-; AVX512DQ-NEXT:    orq %r10, %r11
-; AVX512DQ-NEXT:    movq %r11, 8(%rsi)
-; AVX512DQ-NEXT:    popq %rbx
-; AVX512DQ-NEXT:    popq %r12
-; AVX512DQ-NEXT:    popq %r13
-; AVX512DQ-NEXT:    popq %r14
-; AVX512DQ-NEXT:    popq %r15
-; AVX512DQ-NEXT:    popq %rbp
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm3
+; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k1
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm2, %zmm4
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm5, %zmm6
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm7, %zmm8
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm9, %zmm10
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm11, %zmm12
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm13 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm13, %zmm0
+; AVX512DQ-NEXT:    vpermd %zmm3, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpermd %zmm3, %zmm2, %zmm2
+; AVX512DQ-NEXT:    vpermd %zmm3, %zmm5, %zmm5
+; AVX512DQ-NEXT:    vpermd %zmm3, %zmm7, %zmm7
+; AVX512DQ-NEXT:    vpermd %zmm3, %zmm9, %zmm9
+; AVX512DQ-NEXT:    vpermd %zmm3, %zmm11, %zmm11
+; AVX512DQ-NEXT:    vpermd %zmm3, %zmm13, %zmm3
+; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm13 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm3, %k1
+; AVX512DQ-NEXT:    vmovdqa32 832(%rsi), %zmm3 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm11, %k1
+; AVX512DQ-NEXT:    vmovdqa32 768(%rsi), %zmm11 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm9, %k1
+; AVX512DQ-NEXT:    vmovdqa32 704(%rsi), %zmm9 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm7, %k1
+; AVX512DQ-NEXT:    vmovdqa32 640(%rsi), %zmm7 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm5, %k1
+; AVX512DQ-NEXT:    vmovdqa32 576(%rsi), %zmm5 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k1
+; AVX512DQ-NEXT:    vmovdqa32 512(%rsi), %zmm2 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k1
+; AVX512DQ-NEXT:    vmovdqa32 448(%rsi), %zmm1 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k1
+; AVX512DQ-NEXT:    vmovdqa32 384(%rsi), %zmm0 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm12, %k1
+; AVX512DQ-NEXT:    vmovdqa32 320(%rsi), %zmm12 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm10, %k1
+; AVX512DQ-NEXT:    vmovdqa32 256(%rsi), %zmm10 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm8, %k1
+; AVX512DQ-NEXT:    vmovdqa32 192(%rsi), %zmm8 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm6, %k1
+; AVX512DQ-NEXT:    vmovdqa32 128(%rsi), %zmm6 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm4, %k1
+; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm4 {%k1} {z}
+; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 64(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm6, 128(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm8, 192(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm10, 256(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm12, 320(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 384(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 448(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 512(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm5, 576(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm7, 640(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm9, 704(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm11, 768(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 832(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm13, (%rdx)
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512BW-LABEL: mask_replication_factor7_vf32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    pushq %rbp
-; AVX512BW-NEXT:    pushq %r15
-; AVX512BW-NEXT:    pushq %r14
-; AVX512BW-NEXT:    pushq %r13
-; AVX512BW-NEXT:    pushq %r12
-; AVX512BW-NEXT:    pushq %rbx
-; AVX512BW-NEXT:    kmovd (%rdi), %k0
+; AVX512BW-NEXT:    kmovd (%rdi), %k6
+; AVX512BW-NEXT:    movw $-3, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k0
+; AVX512BW-NEXT:    kandw %k0, %k6, %k1
+; AVX512BW-NEXT:    kmovq %k0, %k4
+; AVX512BW-NEXT:    kshiftlw $15, %k6, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k0, %k2
+; AVX512BW-NEXT:    korw %k2, %k1, %k1
+; AVX512BW-NEXT:    movw $-5, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k2
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kmovq %k2, %k3
+; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kshiftrw $13, %k0, %k2
+; AVX512BW-NEXT:    korw %k2, %k1, %k1
+; AVX512BW-NEXT:    movw $-9, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k2
+; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $12, %k0, %k2
+; AVX512BW-NEXT:    korw %k2, %k1, %k1
+; AVX512BW-NEXT:    movw $-17, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k2
+; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $11, %k0, %k2
+; AVX512BW-NEXT:    korw %k2, %k1, %k1
+; AVX512BW-NEXT:    movw $-33, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k2
+; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $10, %k0, %k2
+; AVX512BW-NEXT:    korw %k2, %k1, %k1
+; AVX512BW-NEXT:    movw $-65, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k2
+; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $9, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    movw $-129, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k1
+; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k1, %k0, %k1
+; AVX512BW-NEXT:    kshiftrd $1, %k6, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $8, %k0, %k2
+; AVX512BW-NEXT:    korw %k2, %k1, %k1
+; AVX512BW-NEXT:    movw $-257, %ax # imm = 0xFEFF
+; AVX512BW-NEXT:    kmovd %eax, %k2
+; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $7, %k0, %k2
+; AVX512BW-NEXT:    korw %k2, %k1, %k1
+; AVX512BW-NEXT:    movw $-513, %ax # imm = 0xFDFF
+; AVX512BW-NEXT:    kmovd %eax, %k7
+; AVX512BW-NEXT:    kandw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kshiftrw $6, %k0, %k2
+; AVX512BW-NEXT:    korw %k2, %k1, %k1
+; AVX512BW-NEXT:    movw $-1025, %ax # imm = 0xFBFF
+; AVX512BW-NEXT:    kmovd %eax, %k2
+; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $5, %k0, %k2
+; AVX512BW-NEXT:    korw %k2, %k1, %k1
+; AVX512BW-NEXT:    movw $-2049, %ax # imm = 0xF7FF
+; AVX512BW-NEXT:    kmovd %eax, %k2
+; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $4, %k0, %k2
+; AVX512BW-NEXT:    korw %k2, %k1, %k1
+; AVX512BW-NEXT:    movw $-4097, %ax # imm = 0xEFFF
+; AVX512BW-NEXT:    kmovd %eax, %k2
+; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $3, %k0, %k2
+; AVX512BW-NEXT:    korw %k2, %k1, %k1
+; AVX512BW-NEXT:    movw $-8193, %ax # imm = 0xDFFF
+; AVX512BW-NEXT:    kmovd %eax, %k2
+; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $2, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    movw $-16385, %ax # imm = 0xBFFF
+; AVX512BW-NEXT:    kmovd %eax, %k1
+; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrd $2, %k6, %k2
+; AVX512BW-NEXT:    kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512BW-NEXT:    kshiftlw $14, %k2, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k2, %k1
+; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    korw %k1, %k0, %k1
+; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    kmovq %k6, %k2
+; AVX512BW-NEXT:    kshiftrd $29, %k6, %k1
+; AVX512BW-NEXT:    kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512BW-NEXT:    kmovq %k4, %k6
+; AVX512BW-NEXT:    kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k4, %k1, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kshiftrw $14, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrd $30, %k2, %k1
+; AVX512BW-NEXT:    kmovq %k2, %k4
+; AVX512BW-NEXT:    kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $13, %k1, %k3
+; AVX512BW-NEXT:    korw %k3, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $12, %k1, %k3
+; AVX512BW-NEXT:    korw %k3, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $11, %k1, %k3
+; AVX512BW-NEXT:    korw %k3, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $10, %k1, %k3
+; AVX512BW-NEXT:    korw %k3, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $9, %k1, %k3
+; AVX512BW-NEXT:    korw %k3, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $8, %k1, %k3
+; AVX512BW-NEXT:    korw %k3, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $7, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kandw %k7, %k0, %k3
+; AVX512BW-NEXT:    kshiftrd $31, %k4, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k1
+; AVX512BW-NEXT:    kshiftrw $6, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $5, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $4, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $3, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $2, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k3, %k3
+; AVX512BW-NEXT:    kshiftlw $14, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k3, %k0
+; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512BW-NEXT:    korw %k1, %k0, %k1
+; AVX512BW-NEXT:    vmovdqa32 832(%rsi), %zmm1 {%k1} {z}
+; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
+; AVX512BW-NEXT:    kshiftrd $27, %k2, %k1
+; AVX512BW-NEXT:    kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512BW-NEXT:    kandw %k6, %k1, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
+; AVX512BW-NEXT:    kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kshiftrw $14, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $13, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $12, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kandw %k5, %k0, %k7
+; AVX512BW-NEXT:    kshiftrd $28, %k2, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $11, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k7, %k6
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $10, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $9, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $8, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $7, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $6, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $5, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k6, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kshiftrw $4, %k7, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $3, %k7, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $2, %k7, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
+; AVX512BW-NEXT:    kshiftlw $14, %k2, %k5
+; AVX512BW-NEXT:    korw %k5, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512BW-NEXT:    korw %k7, %k0, %k2
+; AVX512BW-NEXT:    vmovdqa32 768(%rsi), %zmm2 {%k2} {z}
+; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 4-byte Reload
+; AVX512BW-NEXT:    kshiftrd $25, %k6, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k2
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k0, %k5
+; AVX512BW-NEXT:    korw %k5, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $13, %k0, %k5
+; AVX512BW-NEXT:    korw %k5, %k2, %k2
+; AVX512BW-NEXT:    kandw %k3, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $12, %k0, %k5
+; AVX512BW-NEXT:    korw %k5, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $11, %k0, %k5
+; AVX512BW-NEXT:    korw %k5, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $10, %k0, %k5
+; AVX512BW-NEXT:    korw %k5, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k2, %k5
+; AVX512BW-NEXT:    kshiftrd $26, %k6, %k2
+; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $9, %k2, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $8, %k2, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $7, %k2, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $6, %k2, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kandw %k1, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $5, %k2, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $4, %k2, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kandw %k4, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $3, %k2, %k2
+; AVX512BW-NEXT:    korw %k2, %k5, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kshiftrw $2, %k6, %k5
+; AVX512BW-NEXT:    korw %k5, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k2, %k2
+; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
+; AVX512BW-NEXT:    kshiftlw $14, %k1, %k3
+; AVX512BW-NEXT:    korw %k3, %k2, %k2
+; AVX512BW-NEXT:    kshiftlw $1, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $1, %k2, %k2
+; AVX512BW-NEXT:    korw %k6, %k2, %k1
+; AVX512BW-NEXT:    vmovdqa32 704(%rsi), %zmm3 {%k1} {z}
+; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload
+; AVX512BW-NEXT:    kshiftrd $23, %k3, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k2
+; AVX512BW-NEXT:    kshiftrd $22, %k3, %k5
+; AVX512BW-NEXT:    kmovd %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512BW-NEXT:    kmovq %k3, %k6
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k5, %k3
+; AVX512BW-NEXT:    kshiftrw $14, %k2, %k5
+; AVX512BW-NEXT:    korw %k5, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $13, %k2, %k5
+; AVX512BW-NEXT:    korw %k5, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $12, %k2, %k5
+; AVX512BW-NEXT:    korw %k5, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $11, %k2, %k5
+; AVX512BW-NEXT:    korw %k5, %k3, %k3
+; AVX512BW-NEXT:    kandw %k7, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $10, %k2, %k5
+; AVX512BW-NEXT:    korw %k5, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $9, %k2, %k5
+; AVX512BW-NEXT:    korw %k5, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $8, %k2, %k2
+; AVX512BW-NEXT:    korw %k2, %k3, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k2, %k2
+; AVX512BW-NEXT:    kshiftrd $24, %k6, %k3
+; AVX512BW-NEXT:    kshiftlw $15, %k3, %k5
+; AVX512BW-NEXT:    kshiftrw $7, %k5, %k6
+; AVX512BW-NEXT:    korw %k6, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $6, %k5, %k6
+; AVX512BW-NEXT:    korw %k6, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $5, %k5, %k6
+; AVX512BW-NEXT:    korw %k6, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $4, %k5, %k6
+; AVX512BW-NEXT:    korw %k6, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $3, %k5, %k6
+; AVX512BW-NEXT:    korw %k6, %k2, %k2
+; AVX512BW-NEXT:    kandw %k4, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $2, %k5, %k5
+; AVX512BW-NEXT:    korw %k5, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k2, %k2
+; AVX512BW-NEXT:    kshiftlw $14, %k3, %k3
+; AVX512BW-NEXT:    korw %k3, %k2, %k2
+; AVX512BW-NEXT:    kshiftlw $1, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $1, %k2, %k2
+; AVX512BW-NEXT:    korw %k0, %k2, %k2
+; AVX512BW-NEXT:    vmovdqa32 640(%rsi), %zmm4 {%k2} {z}
+; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload
+; AVX512BW-NEXT:    kshiftrd $20, %k3, %k5
+; AVX512BW-NEXT:    kmovd %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k5, %k2
+; AVX512BW-NEXT:    kshiftlw $15, %k5, %k6
+; AVX512BW-NEXT:    kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kshiftrw $14, %k6, %k5
+; AVX512BW-NEXT:    korw %k5, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $13, %k6, %k5
+; AVX512BW-NEXT:    korw %k5, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k2, %k5
+; AVX512BW-NEXT:    kshiftrd $21, %k3, %k2
+; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $12, %k2, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $11, %k2, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $10, %k2, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kandw %k7, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $9, %k2, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $8, %k2, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $7, %k2, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kandw %k1, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $6, %k2, %k2
+; AVX512BW-NEXT:    korw %k2, %k5, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k2, %k5
+; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 4-byte Reload
+; AVX512BW-NEXT:    kshiftlw $15, %k7, %k2
+; AVX512BW-NEXT:    kshiftrw $5, %k2, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $4, %k2, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $3, %k2, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $2, %k2, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kandw %k4, %k5, %k5
+; AVX512BW-NEXT:    kshiftlw $14, %k7, %k1
+; AVX512BW-NEXT:    korw %k1, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
+; AVX512BW-NEXT:    korw %k2, %k1, %k1
+; AVX512BW-NEXT:    vmovdqa32 576(%rsi), %zmm5 {%k1} {z}
+; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 4-byte Reload
+; AVX512BW-NEXT:    kshiftrd $18, %k4, %k2
+; AVX512BW-NEXT:    kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k2, %k5
+; AVX512BW-NEXT:    kshiftlw $15, %k2, %k7
+; AVX512BW-NEXT:    kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kshiftrw $14, %k7, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $13, %k7, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $12, %k7, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $11, %k7, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k5, %k6
+; AVX512BW-NEXT:    kshiftrd $19, %k4, %k5
+; AVX512BW-NEXT:    kshiftlw $15, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $10, %k5, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $9, %k5, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $8, %k5, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $7, %k5, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $6, %k5, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $5, %k5, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kandw %k3, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $4, %k5, %k5
+; AVX512BW-NEXT:    korw %k5, %k6, %k5
+; AVX512BW-NEXT:    kandw %k0, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kshiftrw $3, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $2, %k3, %k6
+; AVX512BW-NEXT:    kmovq %k3, %k7
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k5, %k5
+; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload
+; AVX512BW-NEXT:    kshiftlw $14, %k0, %k3
+; AVX512BW-NEXT:    korw %k3, %k5, %k3
+; AVX512BW-NEXT:    kshiftlw $1, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $1, %k3, %k3
+; AVX512BW-NEXT:    korw %k7, %k3, %k3
+; AVX512BW-NEXT:    vmovdqa32 512(%rsi), %zmm6 {%k3} {z}
+; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
+; AVX512BW-NEXT:    kshiftrd $16, %k1, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k3
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k0, %k5
+; AVX512BW-NEXT:    korw %k5, %k3, %k3
+; AVX512BW-NEXT:    kandw %k2, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $13, %k0, %k5
+; AVX512BW-NEXT:    korw %k5, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $12, %k0, %k5
+; AVX512BW-NEXT:    korw %k5, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $11, %k0, %k5
+; AVX512BW-NEXT:    korw %k5, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $10, %k0, %k5
+; AVX512BW-NEXT:    korw %k5, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $9, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k3, %k0
+; AVX512BW-NEXT:    kandw %k4, %k0, %k3
+; AVX512BW-NEXT:    kshiftrd $17, %k1, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $8, %k0, %k5
+; AVX512BW-NEXT:    korw %k5, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $7, %k0, %k5
+; AVX512BW-NEXT:    korw %k5, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $6, %k0, %k5
+; AVX512BW-NEXT:    korw %k5, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $5, %k0, %k5
+; AVX512BW-NEXT:    korw %k5, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $4, %k0, %k5
+; AVX512BW-NEXT:    korw %k5, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $3, %k0, %k5
+; AVX512BW-NEXT:    korw %k5, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $2, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k3, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
+; AVX512BW-NEXT:    kshiftlw $14, %k1, %k2
+; AVX512BW-NEXT:    korw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    korw %k1, %k0, %k1
+; AVX512BW-NEXT:    vmovdqa32 448(%rsi), %zmm7 {%k1} {z}
+; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload
 ; AVX512BW-NEXT:    kshiftrd $13, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512BW-NEXT:    kshiftrd $12, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %edx
-; AVX512BW-NEXT:    kshiftrd $11, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %edi
-; AVX512BW-NEXT:    kshiftrd $10, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r8d
-; AVX512BW-NEXT:    kshiftrd $27, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r9d
-; AVX512BW-NEXT:    kshiftrd $22, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r11d
-; AVX512BW-NEXT:    kshiftrd $19, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %ebp
-; AVX512BW-NEXT:    kshiftrd $18, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %ecx
-; AVX512BW-NEXT:    kshiftrd $28, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r15d
-; AVX512BW-NEXT:    kshiftrd $29, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %ebx
-; AVX512BW-NEXT:    kshiftrd $30, %k0, %k1
-; AVX512BW-NEXT:    movl %r9d, %r10d
-; AVX512BW-NEXT:    andl $1, %r10d
-; AVX512BW-NEXT:    leal (%r10,%r10,2), %r14d
-; AVX512BW-NEXT:    leal (%r14,%r10,4), %r14d
-; AVX512BW-NEXT:    leal (%r14,%r10,8), %r14d
-; AVX512BW-NEXT:    andl $1, %r15d
-; AVX512BW-NEXT:    movl %r15d, %r12d
-; AVX512BW-NEXT:    shll $4, %r12d
-; AVX512BW-NEXT:    orl %r14d, %r12d
-; AVX512BW-NEXT:    movl %r15d, %r14d
-; AVX512BW-NEXT:    shll $5, %r14d
-; AVX512BW-NEXT:    orl %r12d, %r14d
-; AVX512BW-NEXT:    movl %r15d, %r12d
-; AVX512BW-NEXT:    shll $6, %r12d
-; AVX512BW-NEXT:    movl %r15d, %r13d
-; AVX512BW-NEXT:    shll $7, %r13d
-; AVX512BW-NEXT:    orl %r12d, %r13d
-; AVX512BW-NEXT:    movl %r15d, %r12d
-; AVX512BW-NEXT:    shll $8, %r12d
-; AVX512BW-NEXT:    orl %r13d, %r12d
-; AVX512BW-NEXT:    movl %r15d, %r13d
-; AVX512BW-NEXT:    shll $9, %r13d
-; AVX512BW-NEXT:    orl %r12d, %r13d
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrd $31, %k0, %k1
-; AVX512BW-NEXT:    shll $10, %r15d
-; AVX512BW-NEXT:    orl %r13d, %r15d
-; AVX512BW-NEXT:    andl $1, %ebx
-; AVX512BW-NEXT:    movl %ebx, %r13d
-; AVX512BW-NEXT:    shll $11, %r13d
-; AVX512BW-NEXT:    orl %r15d, %r13d
-; AVX512BW-NEXT:    movl %ebx, %r15d
-; AVX512BW-NEXT:    shll $12, %r15d
-; AVX512BW-NEXT:    orl %r13d, %r15d
-; AVX512BW-NEXT:    movl %ebx, %r13d
-; AVX512BW-NEXT:    shll $13, %r13d
-; AVX512BW-NEXT:    orl %r15d, %r13d
-; AVX512BW-NEXT:    movl %ebx, %r15d
-; AVX512BW-NEXT:    shll $14, %r15d
-; AVX512BW-NEXT:    orl %r13d, %r15d
-; AVX512BW-NEXT:    movl %ebx, %r13d
-; AVX512BW-NEXT:    shll $15, %r13d
-; AVX512BW-NEXT:    orl %r15d, %r13d
-; AVX512BW-NEXT:    movl %ebx, %eax
-; AVX512BW-NEXT:    shll $16, %eax
-; AVX512BW-NEXT:    orl %r13d, %eax
-; AVX512BW-NEXT:    kmovd %k1, %r15d
-; AVX512BW-NEXT:    kshiftrd $4, %k0, %k1
-; AVX512BW-NEXT:    shll $17, %ebx
-; AVX512BW-NEXT:    orl %eax, %ebx
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movl %r12d, %eax
-; AVX512BW-NEXT:    shll $18, %eax
-; AVX512BW-NEXT:    orl %ebx, %eax
-; AVX512BW-NEXT:    movl %r12d, %ebx
-; AVX512BW-NEXT:    shll $19, %ebx
-; AVX512BW-NEXT:    orl %eax, %ebx
-; AVX512BW-NEXT:    movl %r12d, %eax
-; AVX512BW-NEXT:    shll $20, %eax
-; AVX512BW-NEXT:    orl %ebx, %eax
-; AVX512BW-NEXT:    movl %r12d, %ebx
-; AVX512BW-NEXT:    shll $21, %ebx
-; AVX512BW-NEXT:    orl %eax, %ebx
-; AVX512BW-NEXT:    movl %r12d, %eax
-; AVX512BW-NEXT:    shll $22, %eax
-; AVX512BW-NEXT:    orl %ebx, %eax
-; AVX512BW-NEXT:    movl %r12d, %ebx
-; AVX512BW-NEXT:    shll $23, %ebx
-; AVX512BW-NEXT:    orl %eax, %ebx
-; AVX512BW-NEXT:    shll $24, %r12d
-; AVX512BW-NEXT:    orl %ebx, %r12d
-; AVX512BW-NEXT:    movl %r15d, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movl %eax, %ebx
-; AVX512BW-NEXT:    shll $25, %ebx
-; AVX512BW-NEXT:    orl %r12d, %ebx
-; AVX512BW-NEXT:    movl %eax, %r12d
-; AVX512BW-NEXT:    shll $26, %r12d
-; AVX512BW-NEXT:    orl %ebx, %r12d
-; AVX512BW-NEXT:    movl %eax, %ebx
-; AVX512BW-NEXT:    shll $27, %ebx
-; AVX512BW-NEXT:    orl %r12d, %ebx
-; AVX512BW-NEXT:    movl %eax, %r12d
-; AVX512BW-NEXT:    shll $28, %r12d
-; AVX512BW-NEXT:    orl %ebx, %r12d
-; AVX512BW-NEXT:    movl %eax, %r13d
-; AVX512BW-NEXT:    shll $29, %r13d
-; AVX512BW-NEXT:    orl %r12d, %r13d
-; AVX512BW-NEXT:    kmovd %k1, %ebx
-; AVX512BW-NEXT:    kshiftrd $1, %k0, %k1
-; AVX512BW-NEXT:    shll $30, %eax
-; AVX512BW-NEXT:    orl %r13d, %eax
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrd $2, %k0, %k1
-; AVX512BW-NEXT:    shll $31, %r15d
-; AVX512BW-NEXT:    orl %eax, %r15d
-; AVX512BW-NEXT:    orl %r14d, %r15d
-; AVX512BW-NEXT:    movl %r15d, 24(%rsi)
-; AVX512BW-NEXT:    kmovd %k0, %eax
-; AVX512BW-NEXT:    movzbl %al, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    leaq (%rax,%rax,2), %r14
-; AVX512BW-NEXT:    leaq (%r14,%rax,4), %r14
-; AVX512BW-NEXT:    leaq (%r14,%rax,8), %r14
-; AVX512BW-NEXT:    movq %rax, %r15
-; AVX512BW-NEXT:    shlq $4, %r15
-; AVX512BW-NEXT:    orq %r14, %r15
-; AVX512BW-NEXT:    movq %rax, %r14
-; AVX512BW-NEXT:    shlq $5, %r14
-; AVX512BW-NEXT:    orq %r15, %r14
-; AVX512BW-NEXT:    shlq $6, %rax
-; AVX512BW-NEXT:    movzbl %r12b, %r15d
-; AVX512BW-NEXT:    andl $1, %r15d
-; AVX512BW-NEXT:    movq %r15, %r12
-; AVX512BW-NEXT:    shlq $7, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $8, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r15, %r12
-; AVX512BW-NEXT:    shlq $9, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $10, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r15, %r12
-; AVX512BW-NEXT:    shlq $11, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $12, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrd $3, %k0, %k1
-; AVX512BW-NEXT:    shlq $13, %r15
-; AVX512BW-NEXT:    orq %rax, %r15
-; AVX512BW-NEXT:    movzbl %r12b, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $14, %rax
-; AVX512BW-NEXT:    orq %r15, %rax
-; AVX512BW-NEXT:    movq %r12, %r15
-; AVX512BW-NEXT:    shlq $15, %r15
-; AVX512BW-NEXT:    orq %rax, %r15
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $16, %rax
-; AVX512BW-NEXT:    orq %r15, %rax
-; AVX512BW-NEXT:    movq %r12, %r15
-; AVX512BW-NEXT:    shlq $17, %r15
-; AVX512BW-NEXT:    orq %rax, %r15
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $18, %rax
-; AVX512BW-NEXT:    orq %r15, %rax
-; AVX512BW-NEXT:    movq %r12, %r15
-; AVX512BW-NEXT:    shlq $19, %r15
-; AVX512BW-NEXT:    orq %rax, %r15
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    kshiftrd $9, %k0, %k1
-; AVX512BW-NEXT:    shlq $20, %r12
-; AVX512BW-NEXT:    orq %r15, %r12
-; AVX512BW-NEXT:    movzbl %al, %r15d
-; AVX512BW-NEXT:    andl $1, %r15d
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $21, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r15, %r12
-; AVX512BW-NEXT:    shlq $22, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $23, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r15, %r12
-; AVX512BW-NEXT:    shlq $24, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $25, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r15, %r12
-; AVX512BW-NEXT:    shlq $26, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    shlq $27, %r15
-; AVX512BW-NEXT:    orq %r12, %r15
-; AVX512BW-NEXT:    movzbl %bl, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %r12
-; AVX512BW-NEXT:    shlq $28, %r12
-; AVX512BW-NEXT:    orq %r15, %r12
-; AVX512BW-NEXT:    movq %rax, %r15
-; AVX512BW-NEXT:    shlq $29, %r15
-; AVX512BW-NEXT:    orq %r12, %r15
-; AVX512BW-NEXT:    movq %rax, %r12
-; AVX512BW-NEXT:    shlq $30, %r12
-; AVX512BW-NEXT:    orq %r15, %r12
-; AVX512BW-NEXT:    kmovd %k1, %r15d
-; AVX512BW-NEXT:    kshiftrd $5, %k0, %k1
-; AVX512BW-NEXT:    shlq $31, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    andl $1, %ebx
-; AVX512BW-NEXT:    movq %rbx, %r12
-; AVX512BW-NEXT:    shlq $32, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %rbx, %rax
-; AVX512BW-NEXT:    shlq $33, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrd $6, %k0, %k1
-; AVX512BW-NEXT:    shlq $34, %rbx
-; AVX512BW-NEXT:    orq %rax, %rbx
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $35, %rax
-; AVX512BW-NEXT:    orq %rbx, %rax
-; AVX512BW-NEXT:    movq %r12, %rbx
-; AVX512BW-NEXT:    shlq $36, %rbx
-; AVX512BW-NEXT:    orq %rax, %rbx
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $37, %rax
-; AVX512BW-NEXT:    orq %rbx, %rax
-; AVX512BW-NEXT:    movq %r12, %rbx
-; AVX512BW-NEXT:    shlq $38, %rbx
-; AVX512BW-NEXT:    orq %rax, %rbx
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $39, %rax
-; AVX512BW-NEXT:    orq %rbx, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $40, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    kmovd %k1, %ebx
-; AVX512BW-NEXT:    kshiftrd $7, %k0, %k1
-; AVX512BW-NEXT:    shlq $41, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    andl $1, %ebx
-; AVX512BW-NEXT:    movq %rbx, %rax
-; AVX512BW-NEXT:    shlq $42, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %rbx, %r12
-; AVX512BW-NEXT:    shlq $43, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %rbx, %rax
-; AVX512BW-NEXT:    shlq $44, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %rbx, %r12
-; AVX512BW-NEXT:    shlq $45, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %rbx, %rax
-; AVX512BW-NEXT:    shlq $46, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %rbx, %r13
-; AVX512BW-NEXT:    shlq $47, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrd $8, %k0, %k1
-; AVX512BW-NEXT:    shlq $48, %rbx
-; AVX512BW-NEXT:    orq %r13, %rbx
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $49, %rax
-; AVX512BW-NEXT:    orq %rbx, %rax
-; AVX512BW-NEXT:    movq %r12, %rbx
-; AVX512BW-NEXT:    shlq $50, %rbx
-; AVX512BW-NEXT:    orq %rax, %rbx
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $51, %rax
-; AVX512BW-NEXT:    orq %rbx, %rax
-; AVX512BW-NEXT:    movq %r12, %rbx
-; AVX512BW-NEXT:    shlq $52, %rbx
-; AVX512BW-NEXT:    orq %rax, %rbx
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $53, %rax
-; AVX512BW-NEXT:    orq %rbx, %rax
-; AVX512BW-NEXT:    movq %r12, %rbx
-; AVX512BW-NEXT:    shlq $54, %rbx
-; AVX512BW-NEXT:    orq %rax, %rbx
-; AVX512BW-NEXT:    kmovd %k1, %r13d
-; AVX512BW-NEXT:    kshiftrd $20, %k0, %k1
-; AVX512BW-NEXT:    shlq $55, %r12
-; AVX512BW-NEXT:    orq %rbx, %r12
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $56, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %rbx
-; AVX512BW-NEXT:    shlq $57, %rbx
-; AVX512BW-NEXT:    orq %rax, %rbx
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $58, %rax
-; AVX512BW-NEXT:    orq %rbx, %rax
-; AVX512BW-NEXT:    movq %r13, %rbx
-; AVX512BW-NEXT:    shlq $59, %rbx
-; AVX512BW-NEXT:    orq %rax, %rbx
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $60, %rax
-; AVX512BW-NEXT:    orq %rbx, %rax
-; AVX512BW-NEXT:    movq %r13, %rbx
-; AVX512BW-NEXT:    shlq $61, %rbx
-; AVX512BW-NEXT:    orq %rax, %rbx
-; AVX512BW-NEXT:    shlq $62, %r13
-; AVX512BW-NEXT:    orq %rbx, %r13
-; AVX512BW-NEXT:    movzbl %r15b, %ebx
-; AVX512BW-NEXT:    movq %rbx, %rax
-; AVX512BW-NEXT:    shlq $63, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    orq %r14, %rax
-; AVX512BW-NEXT:    movq %rax, (%rsi)
-; AVX512BW-NEXT:    movzbl %cl, %r14d
-; AVX512BW-NEXT:    movl %r14d, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    leaq (%rax,%rax,2), %r15
-; AVX512BW-NEXT:    leaq (%r15,%rax,4), %r15
-; AVX512BW-NEXT:    leaq (%r15,%rax,8), %r15
-; AVX512BW-NEXT:    shlq $4, %rax
-; AVX512BW-NEXT:    orq %r15, %rax
-; AVX512BW-NEXT:    movzbl %bpl, %r13d
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movq %r13, %r15
-; AVX512BW-NEXT:    shlq $5, %r15
-; AVX512BW-NEXT:    orq %rax, %r15
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $6, %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $7, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $8, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $9, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $10, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    kmovd %k1, %ebp
-; AVX512BW-NEXT:    kshiftrd $21, %k0, %k1
-; AVX512BW-NEXT:    shlq $11, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movzbl %bpl, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $12, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $13, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $14, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $15, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $16, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $17, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    kshiftrd $23, %k0, %k1
-; AVX512BW-NEXT:    shlq $18, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    movzbl %al, %r13d
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $19, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $20, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $21, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $22, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $23, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $24, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    shlq $25, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    movzbl %r11b, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $26, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $27, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $28, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $29, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $30, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    kmovd %k1, %r13d
-; AVX512BW-NEXT:    kshiftrd $24, %k0, %k1
-; AVX512BW-NEXT:    shlq $31, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    kmovd %k1, %ebp
-; AVX512BW-NEXT:    kshiftrd $25, %k0, %k1
-; AVX512BW-NEXT:    andl $1, %r11d
-; AVX512BW-NEXT:    shlq $32, %r11
-; AVX512BW-NEXT:    orq %r12, %r11
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $33, %rax
-; AVX512BW-NEXT:    orq %r11, %rax
-; AVX512BW-NEXT:    movq %r13, %r11
-; AVX512BW-NEXT:    shlq $34, %r11
-; AVX512BW-NEXT:    orq %rax, %r11
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $35, %rax
-; AVX512BW-NEXT:    orq %r11, %rax
-; AVX512BW-NEXT:    movq %r13, %r11
-; AVX512BW-NEXT:    shlq $36, %r11
-; AVX512BW-NEXT:    orq %rax, %r11
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $37, %rax
-; AVX512BW-NEXT:    orq %r11, %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $38, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    kmovd %k1, %r11d
-; AVX512BW-NEXT:    kshiftrd $26, %k0, %k1
-; AVX512BW-NEXT:    shlq $39, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    andl $1, %ebp
-; AVX512BW-NEXT:    movq %rbp, %rax
-; AVX512BW-NEXT:    shlq $40, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %rbp, %r12
-; AVX512BW-NEXT:    shlq $41, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %rbp, %rax
-; AVX512BW-NEXT:    shlq $42, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %rbp, %r12
-; AVX512BW-NEXT:    shlq $43, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %rbp, %rax
-; AVX512BW-NEXT:    shlq $44, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %rbp, %r13
-; AVX512BW-NEXT:    shlq $45, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrd $14, %k0, %k1
-; AVX512BW-NEXT:    shlq $46, %rbp
-; AVX512BW-NEXT:    orq %r13, %rbp
-; AVX512BW-NEXT:    andl $1, %r11d
-; AVX512BW-NEXT:    movq %r11, %rax
-; AVX512BW-NEXT:    shlq $47, %rax
-; AVX512BW-NEXT:    orq %rbp, %rax
-; AVX512BW-NEXT:    movq %r11, %r13
-; AVX512BW-NEXT:    shlq $48, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r11, %rax
-; AVX512BW-NEXT:    shlq $49, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r11, %r13
-; AVX512BW-NEXT:    shlq $50, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r11, %rax
-; AVX512BW-NEXT:    shlq $51, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r11, %r13
-; AVX512BW-NEXT:    shlq $52, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    kmovd %k1, %ebp
-; AVX512BW-NEXT:    kshiftrd $15, %k0, %k1
-; AVX512BW-NEXT:    shlq $53, %r11
-; AVX512BW-NEXT:    orq %r13, %r11
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $54, %rax
-; AVX512BW-NEXT:    orq %r11, %rax
-; AVX512BW-NEXT:    movq %r12, %r11
-; AVX512BW-NEXT:    shlq $55, %r11
-; AVX512BW-NEXT:    orq %rax, %r11
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $56, %rax
-; AVX512BW-NEXT:    orq %r11, %rax
-; AVX512BW-NEXT:    movq %r12, %r11
-; AVX512BW-NEXT:    shlq $57, %r11
-; AVX512BW-NEXT:    orq %rax, %r11
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $58, %rax
-; AVX512BW-NEXT:    orq %r11, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $59, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    kmovd %k1, %r11d
-; AVX512BW-NEXT:    kshiftrd $16, %k0, %k1
-; AVX512BW-NEXT:    shlq $60, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    movq %r10, %r13
-; AVX512BW-NEXT:    shlq $61, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrd $17, %k0, %k0
-; AVX512BW-NEXT:    shlq $62, %r10
-; AVX512BW-NEXT:    orq %r13, %r10
-; AVX512BW-NEXT:    kmovd %k0, %r13d
-; AVX512BW-NEXT:    movzbl %r9b, %eax
-; AVX512BW-NEXT:    shlq $63, %rax
-; AVX512BW-NEXT:    orq %r10, %rax
-; AVX512BW-NEXT:    orq %r15, %rax
-; AVX512BW-NEXT:    movq %rax, 16(%rsi)
-; AVX512BW-NEXT:    andl $1, %ebx
-; AVX512BW-NEXT:    leaq (%rbx,%rbx,2), %rax
-; AVX512BW-NEXT:    leaq (%rax,%rbx,4), %rax
-; AVX512BW-NEXT:    leaq (%rax,%rbx,8), %rax
-; AVX512BW-NEXT:    movq %rbx, %r9
-; AVX512BW-NEXT:    shlq $4, %r9
-; AVX512BW-NEXT:    orq %rax, %r9
-; AVX512BW-NEXT:    shlq $5, %rbx
-; AVX512BW-NEXT:    orq %r9, %rbx
-; AVX512BW-NEXT:    movzbl %r8b, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %r8
-; AVX512BW-NEXT:    shlq $6, %r8
-; AVX512BW-NEXT:    movq %rax, %r9
-; AVX512BW-NEXT:    shlq $7, %r9
-; AVX512BW-NEXT:    orq %r8, %r9
-; AVX512BW-NEXT:    movq %rax, %r8
-; AVX512BW-NEXT:    shlq $8, %r8
-; AVX512BW-NEXT:    orq %r9, %r8
-; AVX512BW-NEXT:    movq %rax, %r9
-; AVX512BW-NEXT:    shlq $9, %r9
-; AVX512BW-NEXT:    orq %r8, %r9
-; AVX512BW-NEXT:    movq %rax, %r8
-; AVX512BW-NEXT:    shlq $10, %r8
-; AVX512BW-NEXT:    orq %r9, %r8
-; AVX512BW-NEXT:    movq %rax, %r9
-; AVX512BW-NEXT:    shlq $11, %r9
-; AVX512BW-NEXT:    orq %r8, %r9
-; AVX512BW-NEXT:    shlq $12, %rax
-; AVX512BW-NEXT:    orq %r9, %rax
-; AVX512BW-NEXT:    movzbl %dil, %edi
-; AVX512BW-NEXT:    andl $1, %edi
-; AVX512BW-NEXT:    movq %rdi, %r8
-; AVX512BW-NEXT:    shlq $13, %r8
-; AVX512BW-NEXT:    orq %rax, %r8
-; AVX512BW-NEXT:    movq %rdi, %rax
-; AVX512BW-NEXT:    shlq $14, %rax
-; AVX512BW-NEXT:    orq %r8, %rax
-; AVX512BW-NEXT:    movq %rdi, %r8
-; AVX512BW-NEXT:    shlq $15, %r8
-; AVX512BW-NEXT:    orq %rax, %r8
-; AVX512BW-NEXT:    movq %rdi, %rax
-; AVX512BW-NEXT:    shlq $16, %rax
-; AVX512BW-NEXT:    orq %r8, %rax
-; AVX512BW-NEXT:    movq %rdi, %r8
-; AVX512BW-NEXT:    shlq $17, %r8
-; AVX512BW-NEXT:    orq %rax, %r8
-; AVX512BW-NEXT:    movq %rdi, %rax
-; AVX512BW-NEXT:    shlq $18, %rax
-; AVX512BW-NEXT:    orq %r8, %rax
-; AVX512BW-NEXT:    shlq $19, %rdi
-; AVX512BW-NEXT:    orq %rax, %rdi
-; AVX512BW-NEXT:    movzbl %dl, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %rdx
-; AVX512BW-NEXT:    shlq $20, %rdx
-; AVX512BW-NEXT:    orq %rdi, %rdx
-; AVX512BW-NEXT:    movq %rax, %rdi
-; AVX512BW-NEXT:    shlq $21, %rdi
-; AVX512BW-NEXT:    orq %rdx, %rdi
-; AVX512BW-NEXT:    movq %rax, %rdx
-; AVX512BW-NEXT:    shlq $22, %rdx
-; AVX512BW-NEXT:    orq %rdi, %rdx
-; AVX512BW-NEXT:    movq %rax, %rdi
-; AVX512BW-NEXT:    shlq $23, %rdi
-; AVX512BW-NEXT:    orq %rdx, %rdi
-; AVX512BW-NEXT:    movq %rax, %rdx
-; AVX512BW-NEXT:    shlq $24, %rdx
-; AVX512BW-NEXT:    orq %rdi, %rdx
-; AVX512BW-NEXT:    movq %rax, %rdi
-; AVX512BW-NEXT:    shlq $25, %rdi
-; AVX512BW-NEXT:    orq %rdx, %rdi
-; AVX512BW-NEXT:    shlq $26, %rax
-; AVX512BW-NEXT:    orq %rdi, %rax
-; AVX512BW-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
-; AVX512BW-NEXT:    andl $1, %edx
-; AVX512BW-NEXT:    movq %rdx, %rdi
-; AVX512BW-NEXT:    shlq $27, %rdi
-; AVX512BW-NEXT:    orq %rax, %rdi
-; AVX512BW-NEXT:    movq %rdx, %rax
-; AVX512BW-NEXT:    shlq $28, %rax
-; AVX512BW-NEXT:    orq %rdi, %rax
-; AVX512BW-NEXT:    movq %rdx, %rdi
-; AVX512BW-NEXT:    shlq $29, %rdi
-; AVX512BW-NEXT:    orq %rax, %rdi
-; AVX512BW-NEXT:    movq %rdx, %rax
-; AVX512BW-NEXT:    shlq $30, %rax
-; AVX512BW-NEXT:    orq %rdi, %rax
-; AVX512BW-NEXT:    movq %rdx, %rdi
-; AVX512BW-NEXT:    shlq $31, %rdi
-; AVX512BW-NEXT:    orq %rax, %rdi
-; AVX512BW-NEXT:    movq %rdx, %rax
-; AVX512BW-NEXT:    shlq $32, %rax
-; AVX512BW-NEXT:    orq %rdi, %rax
-; AVX512BW-NEXT:    shlq $33, %rdx
-; AVX512BW-NEXT:    orq %rax, %rdx
-; AVX512BW-NEXT:    andl $1, %ebp
-; AVX512BW-NEXT:    movq %rbp, %rax
-; AVX512BW-NEXT:    shlq $34, %rax
-; AVX512BW-NEXT:    orq %rdx, %rax
-; AVX512BW-NEXT:    movq %rbp, %rdx
-; AVX512BW-NEXT:    shlq $35, %rdx
-; AVX512BW-NEXT:    orq %rax, %rdx
-; AVX512BW-NEXT:    movq %rbp, %rax
-; AVX512BW-NEXT:    shlq $36, %rax
-; AVX512BW-NEXT:    orq %rdx, %rax
-; AVX512BW-NEXT:    movq %rbp, %rdx
-; AVX512BW-NEXT:    shlq $37, %rdx
-; AVX512BW-NEXT:    orq %rax, %rdx
-; AVX512BW-NEXT:    movq %rbp, %rax
-; AVX512BW-NEXT:    shlq $38, %rax
-; AVX512BW-NEXT:    orq %rdx, %rax
-; AVX512BW-NEXT:    movq %rbp, %rdx
-; AVX512BW-NEXT:    shlq $39, %rdx
-; AVX512BW-NEXT:    orq %rax, %rdx
-; AVX512BW-NEXT:    shlq $40, %rbp
-; AVX512BW-NEXT:    orq %rdx, %rbp
-; AVX512BW-NEXT:    andl $1, %r11d
-; AVX512BW-NEXT:    movq %r11, %rax
-; AVX512BW-NEXT:    shlq $41, %rax
-; AVX512BW-NEXT:    orq %rbp, %rax
-; AVX512BW-NEXT:    movq %r11, %rdx
-; AVX512BW-NEXT:    shlq $42, %rdx
-; AVX512BW-NEXT:    orq %rax, %rdx
-; AVX512BW-NEXT:    movq %r11, %rax
-; AVX512BW-NEXT:    shlq $43, %rax
-; AVX512BW-NEXT:    orq %rdx, %rax
-; AVX512BW-NEXT:    movq %r11, %rdx
-; AVX512BW-NEXT:    shlq $44, %rdx
-; AVX512BW-NEXT:    orq %rax, %rdx
-; AVX512BW-NEXT:    movq %r11, %rax
-; AVX512BW-NEXT:    shlq $45, %rax
-; AVX512BW-NEXT:    orq %rdx, %rax
-; AVX512BW-NEXT:    movq %r11, %rdx
-; AVX512BW-NEXT:    shlq $46, %rdx
-; AVX512BW-NEXT:    orq %rax, %rdx
-; AVX512BW-NEXT:    shlq $47, %r11
-; AVX512BW-NEXT:    orq %rdx, %r11
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $48, %rax
-; AVX512BW-NEXT:    orq %r11, %rax
-; AVX512BW-NEXT:    movq %r12, %rdx
-; AVX512BW-NEXT:    shlq $49, %rdx
-; AVX512BW-NEXT:    orq %rax, %rdx
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $50, %rax
-; AVX512BW-NEXT:    orq %rdx, %rax
-; AVX512BW-NEXT:    movq %r12, %rdx
-; AVX512BW-NEXT:    shlq $51, %rdx
-; AVX512BW-NEXT:    orq %rax, %rdx
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $52, %rax
-; AVX512BW-NEXT:    orq %rdx, %rax
-; AVX512BW-NEXT:    movq %r12, %rdx
-; AVX512BW-NEXT:    shlq $53, %rdx
-; AVX512BW-NEXT:    orq %rax, %rdx
-; AVX512BW-NEXT:    shlq $54, %r12
-; AVX512BW-NEXT:    orq %rdx, %r12
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $55, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %rdx
-; AVX512BW-NEXT:    shlq $56, %rdx
-; AVX512BW-NEXT:    orq %rax, %rdx
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $57, %rax
-; AVX512BW-NEXT:    orq %rdx, %rax
-; AVX512BW-NEXT:    movq %r13, %rdx
-; AVX512BW-NEXT:    shlq $58, %rdx
-; AVX512BW-NEXT:    orq %rax, %rdx
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $59, %rax
-; AVX512BW-NEXT:    orq %rdx, %rax
-; AVX512BW-NEXT:    movq %r13, %rdx
-; AVX512BW-NEXT:    shlq $60, %rdx
-; AVX512BW-NEXT:    orq %rax, %rdx
-; AVX512BW-NEXT:    shlq $61, %r13
-; AVX512BW-NEXT:    orq %rdx, %r13
-; AVX512BW-NEXT:    andl $1, %ecx
-; AVX512BW-NEXT:    shlq $62, %rcx
-; AVX512BW-NEXT:    orq %r13, %rcx
-; AVX512BW-NEXT:    shlq $63, %r14
-; AVX512BW-NEXT:    orq %rcx, %r14
-; AVX512BW-NEXT:    orq %rbx, %r14
-; AVX512BW-NEXT:    movq %r14, 8(%rsi)
-; AVX512BW-NEXT:    popq %rbx
-; AVX512BW-NEXT:    popq %r12
-; AVX512BW-NEXT:    popq %r13
-; AVX512BW-NEXT:    popq %r14
-; AVX512BW-NEXT:    popq %r15
-; AVX512BW-NEXT:    popq %rbp
+; AVX512BW-NEXT:    kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512BW-NEXT:    kandw %k6, %k1, %k2
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kshiftrw $14, %k1, %k3
+; AVX512BW-NEXT:    korw %k3, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k2, %k3
+; AVX512BW-NEXT:    kshiftrd $14, %k0, %k2
+; AVX512BW-NEXT:    kmovq %k0, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $13, %k2, %k5
+; AVX512BW-NEXT:    korw %k5, %k3, %k3
+; AVX512BW-NEXT:    kandw %k7, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $12, %k2, %k5
+; AVX512BW-NEXT:    korw %k5, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $11, %k2, %k5
+; AVX512BW-NEXT:    korw %k5, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $10, %k2, %k5
+; AVX512BW-NEXT:    korw %k5, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $9, %k2, %k5
+; AVX512BW-NEXT:    korw %k5, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $8, %k2, %k5
+; AVX512BW-NEXT:    korw %k5, %k3, %k3
+; AVX512BW-NEXT:    kandw %k4, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $7, %k2, %k2
+; AVX512BW-NEXT:    korw %k2, %k3, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k2, %k5
+; AVX512BW-NEXT:    kshiftrd $15, %k1, %k2
+; AVX512BW-NEXT:    kshiftlw $15, %k2, %k3
+; AVX512BW-NEXT:    kshiftrw $6, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $5, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $4, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $3, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $2, %k3, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k5, %k5
+; AVX512BW-NEXT:    kshiftlw $14, %k2, %k2
+; AVX512BW-NEXT:    korw %k2, %k5, %k2
+; AVX512BW-NEXT:    kshiftlw $1, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $1, %k2, %k2
+; AVX512BW-NEXT:    korw %k3, %k2, %k2
+; AVX512BW-NEXT:    vmovdqa32 384(%rsi), %zmm8 {%k2} {z}
+; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload
+; AVX512BW-NEXT:    kshiftrd $11, %k3, %k6
+; AVX512BW-NEXT:    kmovd %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k6, %k5
+; AVX512BW-NEXT:    kshiftlw $15, %k6, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k0, %k6
+; AVX512BW-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $13, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $12, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kandw %k7, %k5, %k6
+; AVX512BW-NEXT:    kshiftrd $12, %k3, %k5
+; AVX512BW-NEXT:    kshiftlw $15, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $11, %k5, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $10, %k5, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $9, %k5, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $8, %k5, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $7, %k5, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kandw %k4, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $6, %k5, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $5, %k5, %k5
+; AVX512BW-NEXT:    korw %k5, %k6, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kshiftrw $4, %k7, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $3, %k7, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $2, %k7, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kandw %k1, %k5, %k5
+; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload
+; AVX512BW-NEXT:    kshiftlw $14, %k0, %k1
+; AVX512BW-NEXT:    korw %k1, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    vmovdqa32 320(%rsi), %zmm9 {%k1} {z}
+; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 4-byte Reload
+; AVX512BW-NEXT:    kshiftrd $9, %k6, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k0, %k5
+; AVX512BW-NEXT:    korw %k5, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $13, %k0, %k5
+; AVX512BW-NEXT:    korw %k5, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $12, %k0, %k5
+; AVX512BW-NEXT:    korw %k5, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $11, %k0, %k5
+; AVX512BW-NEXT:    korw %k5, %k1, %k1
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $10, %k0, %k5
+; AVX512BW-NEXT:    korw %k5, %k1, %k1
+; AVX512BW-NEXT:    kandw %k3, %k1, %k5
+; AVX512BW-NEXT:    kshiftrd $10, %k6, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $9, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $8, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $7, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $6, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kandw %k4, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $5, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $4, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $3, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k5, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kshiftrw $2, %k4, %k5
+; AVX512BW-NEXT:    korw %k5, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
+; AVX512BW-NEXT:    kshiftlw $14, %k2, %k2
+; AVX512BW-NEXT:    korw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
+; AVX512BW-NEXT:    korw %k4, %k1, %k1
+; AVX512BW-NEXT:    vmovdqa32 256(%rsi), %zmm10 {%k1} {z}
+; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 4-byte Reload
+; AVX512BW-NEXT:    kshiftrd $7, %k4, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrd $6, %k4, %k5
+; AVX512BW-NEXT:    kmovd %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512BW-NEXT:    kmovq %k4, %k6
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k5, %k2
+; AVX512BW-NEXT:    kshiftrw $14, %k1, %k5
+; AVX512BW-NEXT:    korw %k5, %k2, %k2
+; AVX512BW-NEXT:    kandw %k7, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $13, %k1, %k5
+; AVX512BW-NEXT:    korw %k5, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $12, %k1, %k5
+; AVX512BW-NEXT:    korw %k5, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $11, %k1, %k5
+; AVX512BW-NEXT:    korw %k5, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $10, %k1, %k5
+; AVX512BW-NEXT:    korw %k5, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $9, %k1, %k5
+; AVX512BW-NEXT:    korw %k5, %k2, %k2
+; AVX512BW-NEXT:    kandw %k3, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $8, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k2, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k2
+; AVX512BW-NEXT:    kshiftrd $8, %k6, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k5
+; AVX512BW-NEXT:    kshiftrw $7, %k5, %k6
+; AVX512BW-NEXT:    korw %k6, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $6, %k5, %k6
+; AVX512BW-NEXT:    korw %k6, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $5, %k5, %k6
+; AVX512BW-NEXT:    korw %k6, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $4, %k5, %k6
+; AVX512BW-NEXT:    korw %k6, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $3, %k5, %k6
+; AVX512BW-NEXT:    korw %k6, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $2, %k5, %k5
+; AVX512BW-NEXT:    korw %k5, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k2, %k2
+; AVX512BW-NEXT:    kshiftlw $14, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k2, %k1
+; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
+; AVX512BW-NEXT:    korw %k0, %k1, %k1
+; AVX512BW-NEXT:    vmovdqa32 192(%rsi), %zmm11 {%k1} {z}
+; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 4-byte Reload
+; AVX512BW-NEXT:    kshiftrd $4, %k6, %k1
+; AVX512BW-NEXT:    kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k1, %k2
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k0, %k5
+; AVX512BW-NEXT:    korw %k5, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $13, %k0, %k5
+; AVX512BW-NEXT:    korw %k5, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k2, %k5
+; AVX512BW-NEXT:    kshiftrd $5, %k6, %k2
+; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $12, %k2, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $11, %k2, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $10, %k2, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $9, %k2, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $8, %k2, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $7, %k2, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kandw %k3, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $6, %k2, %k2
+; AVX512BW-NEXT:    korw %k2, %k5, %k2
+; AVX512BW-NEXT:    kandw %k4, %k2, %k5
+; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload
+; AVX512BW-NEXT:    kshiftlw $15, %k3, %k2
+; AVX512BW-NEXT:    kshiftrw $5, %k2, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kandw %k7, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $4, %k2, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $3, %k2, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k5, %k5
+; AVX512BW-NEXT:    kshiftrw $2, %k2, %k6
+; AVX512BW-NEXT:    korw %k6, %k5, %k5
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k5, %k5
+; AVX512BW-NEXT:    kshiftlw $14, %k3, %k3
+; AVX512BW-NEXT:    korw %k3, %k5, %k3
+; AVX512BW-NEXT:    kshiftlw $1, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $1, %k3, %k3
+; AVX512BW-NEXT:    korw %k2, %k3, %k2
+; AVX512BW-NEXT:    vmovdqa32 128(%rsi), %zmm12 {%k2} {z}
+; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kshiftrw $14, %k4, %k3
+; AVX512BW-NEXT:    korw %k3, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $13, %k4, %k3
+; AVX512BW-NEXT:    korw %k3, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $12, %k4, %k3
+; AVX512BW-NEXT:    korw %k3, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $11, %k4, %k3
+; AVX512BW-NEXT:    korw %k3, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k2, %k3
+; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
+; AVX512BW-NEXT:    kshiftrd $3, %k2, %k2
+; AVX512BW-NEXT:    kshiftlw $15, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $10, %k2, %k4
+; AVX512BW-NEXT:    korw %k4, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $9, %k2, %k4
+; AVX512BW-NEXT:    korw %k4, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $8, %k2, %k4
+; AVX512BW-NEXT:    korw %k4, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $7, %k2, %k4
+; AVX512BW-NEXT:    korw %k4, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $6, %k2, %k4
+; AVX512BW-NEXT:    korw %k4, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $5, %k2, %k4
+; AVX512BW-NEXT:    korw %k4, %k3, %k3
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k3, %k3
+; AVX512BW-NEXT:    kshiftrw $4, %k2, %k2
+; AVX512BW-NEXT:    korw %k2, %k3, %k2
+; AVX512BW-NEXT:    kandw %k1, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $3, %k0, %k3
+; AVX512BW-NEXT:    korw %k3, %k2, %k2
+; AVX512BW-NEXT:    kandw %k7, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $2, %k0, %k3
+; AVX512BW-NEXT:    korw %k3, %k2, %k2
+; AVX512BW-NEXT:    kandw %k6, %k2, %k2
+; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
+; AVX512BW-NEXT:    kshiftlw $14, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k2, %k1
+; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
+; AVX512BW-NEXT:    korw %k0, %k1, %k1
+; AVX512BW-NEXT:    vmovdqa32 64(%rsi), %zmm13 {%k1} {z}
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, 64(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm12, 128(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, 192(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm10, 256(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, 320(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, 384(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, 448(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, 512(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, 576(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, 640(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, 704(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, 768(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, 832(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdx)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
-  %src.vec = load <32 x i1>, ptr %in.vec, align 64
-  %tgt.mask = shufflevector <32 x i1> %src.vec, <32 x i1> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-  store <224 x i1> %tgt.mask, ptr %out.vec, align 64
+  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
+  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %tgt.mask = shufflevector <32 x i1> %src.mask, <32 x i1> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+  %data = call <224 x i32> @llvm.masked.load.v224i32.p0(ptr %in.vec, i32 64, <224 x i1> %tgt.mask, <224 x i32> poison)
+  store <224 x i32> %data, ptr %out.vec, align 64
   ret void
 }
 
-define void @mask_replication_factor7_vf64(ptr %in.vec, ptr %out.vec) nounwind {
+define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512F-ONLY-LABEL: mask_replication_factor7_vf64:
 ; AVX512F-ONLY:       # %bb.0:
-; AVX512F-ONLY-NEXT:    pushq %rbp
-; AVX512F-ONLY-NEXT:    pushq %r15
-; AVX512F-ONLY-NEXT:    pushq %r14
-; AVX512F-ONLY-NEXT:    pushq %r13
-; AVX512F-ONLY-NEXT:    pushq %r12
-; AVX512F-ONLY-NEXT:    pushq %rbx
 ; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
-; AVX512F-ONLY-NEXT:    kmovw 2(%rdi), %k0
-; AVX512F-ONLY-NEXT:    kmovw 4(%rdi), %k2
-; AVX512F-ONLY-NEXT:    kmovw 6(%rdi), %k3
-; AVX512F-ONLY-NEXT:    kshiftrw $15, %k1, %k4
-; AVX512F-ONLY-NEXT:    kmovw %k4, %eax
-; AVX512F-ONLY-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512F-ONLY-NEXT:    kshiftrw $13, %k1, %k4
-; AVX512F-ONLY-NEXT:    kmovw %k4, %ecx
-; AVX512F-ONLY-NEXT:    kshiftrw $12, %k1, %k4
-; AVX512F-ONLY-NEXT:    kmovw %k4, %edx
-; AVX512F-ONLY-NEXT:    kshiftrw $11, %k1, %k4
-; AVX512F-ONLY-NEXT:    kmovw %k4, %edi
-; AVX512F-ONLY-NEXT:    kshiftrw $13, %k2, %k4
-; AVX512F-ONLY-NEXT:    kmovw %k4, %ebx
-; AVX512F-ONLY-NEXT:    kshiftrw $5, %k2, %k4
-; AVX512F-ONLY-NEXT:    kmovw %k4, %ebp
-; AVX512F-ONLY-NEXT:    kshiftrw $11, %k3, %k4
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r9d
-; AVX512F-ONLY-NEXT:    kshiftrw $8, %k3, %k4
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r10d
-; AVX512F-ONLY-NEXT:    kshiftrw $6, %k3, %k4
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r11d
-; AVX512F-ONLY-NEXT:    kshiftrw $4, %k1, %k4
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r8d
-; AVX512F-ONLY-NEXT:    kshiftrw $1, %k1, %k4
-; AVX512F-ONLY-NEXT:    kmovw %k4, %eax
-; AVX512F-ONLY-NEXT:    kshiftrw $2, %k1, %k4
-; AVX512F-ONLY-NEXT:    kmovw %k1, %r14d
-; AVX512F-ONLY-NEXT:    movzbl %r14b, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    leaq (%r12,%r12,2), %r14
-; AVX512F-ONLY-NEXT:    leaq (%r14,%r12,4), %r14
-; AVX512F-ONLY-NEXT:    leaq (%r14,%r12,8), %r14
-; AVX512F-ONLY-NEXT:    movq %r12, %r15
-; AVX512F-ONLY-NEXT:    shlq $4, %r15
-; AVX512F-ONLY-NEXT:    orq %r14, %r15
-; AVX512F-ONLY-NEXT:    movq %r12, %r14
-; AVX512F-ONLY-NEXT:    shlq $5, %r14
-; AVX512F-ONLY-NEXT:    orq %r15, %r14
-; AVX512F-ONLY-NEXT:    shlq $6, %r12
-; AVX512F-ONLY-NEXT:    movzbl %al, %r15d
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $7, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $8, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $9, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $10, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $11, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $12, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k4, %eax
-; AVX512F-ONLY-NEXT:    kshiftrw $3, %k1, %k4
-; AVX512F-ONLY-NEXT:    shlq $13, %r15
-; AVX512F-ONLY-NEXT:    orq %r12, %r15
-; AVX512F-ONLY-NEXT:    movzbl %al, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $14, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r15
-; AVX512F-ONLY-NEXT:    shlq $15, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $16, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r15
-; AVX512F-ONLY-NEXT:    shlq $17, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $18, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r15
-; AVX512F-ONLY-NEXT:    shlq $19, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    kmovw %k4, %eax
-; AVX512F-ONLY-NEXT:    kshiftrw $7, %k3, %k4
-; AVX512F-ONLY-NEXT:    shlq $20, %r12
-; AVX512F-ONLY-NEXT:    orq %r15, %r12
-; AVX512F-ONLY-NEXT:    movzbl %al, %r15d
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $21, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $22, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $23, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $24, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $25, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $26, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $27, %r15
-; AVX512F-ONLY-NEXT:    orq %r12, %r15
-; AVX512F-ONLY-NEXT:    movzbl %r8b, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $28, %r12
-; AVX512F-ONLY-NEXT:    orq %r15, %r12
-; AVX512F-ONLY-NEXT:    movq %rax, %r15
-; AVX512F-ONLY-NEXT:    shlq $29, %r15
-; AVX512F-ONLY-NEXT:    orq %r12, %r15
-; AVX512F-ONLY-NEXT:    movq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $30, %r12
-; AVX512F-ONLY-NEXT:    orq %r15, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r15d
-; AVX512F-ONLY-NEXT:    kshiftrw $5, %k1, %k4
-; AVX512F-ONLY-NEXT:    shlq $31, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    andl $1, %r8d
-; AVX512F-ONLY-NEXT:    movq %r8, %r12
-; AVX512F-ONLY-NEXT:    shlq $32, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r8, %rax
-; AVX512F-ONLY-NEXT:    shlq $33, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $6, %k1, %k4
-; AVX512F-ONLY-NEXT:    shlq $34, %r8
-; AVX512F-ONLY-NEXT:    orq %rax, %r8
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $35, %rax
-; AVX512F-ONLY-NEXT:    orq %r8, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r8
-; AVX512F-ONLY-NEXT:    shlq $36, %r8
-; AVX512F-ONLY-NEXT:    orq %rax, %r8
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $37, %rax
-; AVX512F-ONLY-NEXT:    orq %r8, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r8
-; AVX512F-ONLY-NEXT:    shlq $38, %r8
-; AVX512F-ONLY-NEXT:    orq %rax, %r8
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $39, %rax
-; AVX512F-ONLY-NEXT:    orq %r8, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $40, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r8d
-; AVX512F-ONLY-NEXT:    kshiftrw $7, %k1, %k4
-; AVX512F-ONLY-NEXT:    shlq $41, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    andl $1, %r8d
-; AVX512F-ONLY-NEXT:    movq %r8, %rax
-; AVX512F-ONLY-NEXT:    shlq $42, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r8, %r12
-; AVX512F-ONLY-NEXT:    shlq $43, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r8, %rax
-; AVX512F-ONLY-NEXT:    shlq $44, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r8, %r12
-; AVX512F-ONLY-NEXT:    shlq $45, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r8, %rax
-; AVX512F-ONLY-NEXT:    shlq $46, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r8, %r13
-; AVX512F-ONLY-NEXT:    shlq $47, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $8, %k1, %k4
-; AVX512F-ONLY-NEXT:    shlq $48, %r8
-; AVX512F-ONLY-NEXT:    orq %r13, %r8
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $49, %rax
-; AVX512F-ONLY-NEXT:    orq %r8, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r8
-; AVX512F-ONLY-NEXT:    shlq $50, %r8
-; AVX512F-ONLY-NEXT:    orq %rax, %r8
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $51, %rax
-; AVX512F-ONLY-NEXT:    orq %r8, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r8
-; AVX512F-ONLY-NEXT:    shlq $52, %r8
-; AVX512F-ONLY-NEXT:    orq %rax, %r8
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $53, %rax
-; AVX512F-ONLY-NEXT:    orq %r8, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r8
-; AVX512F-ONLY-NEXT:    shlq $54, %r8
-; AVX512F-ONLY-NEXT:    orq %rax, %r8
-; AVX512F-ONLY-NEXT:    kmovw %k4, %eax
-; AVX512F-ONLY-NEXT:    kshiftrw $9, %k1, %k4
-; AVX512F-ONLY-NEXT:    shlq $55, %r12
-; AVX512F-ONLY-NEXT:    orq %r8, %r12
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %r8
-; AVX512F-ONLY-NEXT:    shlq $56, %r8
-; AVX512F-ONLY-NEXT:    orq %r12, %r8
-; AVX512F-ONLY-NEXT:    movq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $57, %r12
-; AVX512F-ONLY-NEXT:    orq %r8, %r12
-; AVX512F-ONLY-NEXT:    movq %rax, %r8
-; AVX512F-ONLY-NEXT:    shlq $58, %r8
-; AVX512F-ONLY-NEXT:    orq %r12, %r8
-; AVX512F-ONLY-NEXT:    movq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $59, %r12
-; AVX512F-ONLY-NEXT:    orq %r8, %r12
-; AVX512F-ONLY-NEXT:    movq %rax, %r8
-; AVX512F-ONLY-NEXT:    shlq $60, %r8
-; AVX512F-ONLY-NEXT:    orq %r12, %r8
-; AVX512F-ONLY-NEXT:    movq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $61, %r12
-; AVX512F-ONLY-NEXT:    orq %r8, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r8d
-; AVX512F-ONLY-NEXT:    kshiftrw $9, %k3, %k4
-; AVX512F-ONLY-NEXT:    shlq $62, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movzbl %r8b, %r8d
-; AVX512F-ONLY-NEXT:    movq %r8, %r12
-; AVX512F-ONLY-NEXT:    shlq $63, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    orq %r14, %r12
-; AVX512F-ONLY-NEXT:    movq %r12, (%rsi)
-; AVX512F-ONLY-NEXT:    movzbl %r15b, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movzbl %r11b, %r14d
-; AVX512F-ONLY-NEXT:    movl %r14d, %r15d
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    leaq (%r15,%rax,2), %r15
-; AVX512F-ONLY-NEXT:    leaq (%r15,%rax,4), %r15
-; AVX512F-ONLY-NEXT:    leaq (%r15,%rax,8), %r15
-; AVX512F-ONLY-NEXT:    movq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $4, %r12
-; AVX512F-ONLY-NEXT:    orq %r15, %r12
-; AVX512F-ONLY-NEXT:    movq %rax, %r15
-; AVX512F-ONLY-NEXT:    shlq $5, %r15
-; AVX512F-ONLY-NEXT:    orq %r12, %r15
-; AVX512F-ONLY-NEXT:    movq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $6, %r12
-; AVX512F-ONLY-NEXT:    shlq $7, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movzbl %r10b, %r10d
-; AVX512F-ONLY-NEXT:    andl $1, %r10d
-; AVX512F-ONLY-NEXT:    movq %r10, %r12
-; AVX512F-ONLY-NEXT:    shlq $8, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r10, %rax
-; AVX512F-ONLY-NEXT:    shlq $9, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r10, %r12
-; AVX512F-ONLY-NEXT:    shlq $10, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r10, %rax
-; AVX512F-ONLY-NEXT:    shlq $11, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r10, %r12
-; AVX512F-ONLY-NEXT:    shlq $12, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r10, %rax
-; AVX512F-ONLY-NEXT:    shlq $13, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $10, %k3, %k4
-; AVX512F-ONLY-NEXT:    shlq $14, %r10
-; AVX512F-ONLY-NEXT:    orq %rax, %r10
-; AVX512F-ONLY-NEXT:    movzbl %r12b, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $15, %rax
-; AVX512F-ONLY-NEXT:    orq %r10, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r10
-; AVX512F-ONLY-NEXT:    shlq $16, %r10
-; AVX512F-ONLY-NEXT:    orq %rax, %r10
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $17, %rax
-; AVX512F-ONLY-NEXT:    orq %r10, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r10
-; AVX512F-ONLY-NEXT:    shlq $18, %r10
-; AVX512F-ONLY-NEXT:    orq %rax, %r10
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $19, %rax
-; AVX512F-ONLY-NEXT:    orq %r10, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r10
-; AVX512F-ONLY-NEXT:    shlq $20, %r10
-; AVX512F-ONLY-NEXT:    orq %rax, %r10
-; AVX512F-ONLY-NEXT:    kmovw %k4, %eax
-; AVX512F-ONLY-NEXT:    kshiftrw $15, %k3, %k4
-; AVX512F-ONLY-NEXT:    shlq $21, %r12
-; AVX512F-ONLY-NEXT:    orq %r10, %r12
-; AVX512F-ONLY-NEXT:    movzbl %al, %r10d
-; AVX512F-ONLY-NEXT:    andl $1, %r10d
-; AVX512F-ONLY-NEXT:    movq %r10, %rax
-; AVX512F-ONLY-NEXT:    shlq $22, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r10, %r12
-; AVX512F-ONLY-NEXT:    shlq $23, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r10, %rax
-; AVX512F-ONLY-NEXT:    shlq $24, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r10, %r12
-; AVX512F-ONLY-NEXT:    shlq $25, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r10, %rax
-; AVX512F-ONLY-NEXT:    shlq $26, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r10, %r12
-; AVX512F-ONLY-NEXT:    shlq $27, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $28, %r10
-; AVX512F-ONLY-NEXT:    orq %r12, %r10
-; AVX512F-ONLY-NEXT:    movzbl %r9b, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $29, %r12
-; AVX512F-ONLY-NEXT:    orq %r10, %r12
-; AVX512F-ONLY-NEXT:    movq %rax, %r13
-; AVX512F-ONLY-NEXT:    shlq $30, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r10d
-; AVX512F-ONLY-NEXT:    kshiftrw $12, %k3, %k4
-; AVX512F-ONLY-NEXT:    shlq $31, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    andl $1, %r9d
-; AVX512F-ONLY-NEXT:    movq %r9, %r12
-; AVX512F-ONLY-NEXT:    shlq $32, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r9, %rax
-; AVX512F-ONLY-NEXT:    shlq $33, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r9, %r13
-; AVX512F-ONLY-NEXT:    shlq $34, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $13, %k3, %k4
-; AVX512F-ONLY-NEXT:    shlq $35, %r9
-; AVX512F-ONLY-NEXT:    orq %r13, %r9
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $36, %rax
-; AVX512F-ONLY-NEXT:    orq %r9, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r9
-; AVX512F-ONLY-NEXT:    shlq $37, %r9
-; AVX512F-ONLY-NEXT:    orq %rax, %r9
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $38, %rax
-; AVX512F-ONLY-NEXT:    orq %r9, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r9
-; AVX512F-ONLY-NEXT:    shlq $39, %r9
-; AVX512F-ONLY-NEXT:    orq %rax, %r9
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $40, %rax
-; AVX512F-ONLY-NEXT:    orq %r9, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r9
-; AVX512F-ONLY-NEXT:    shlq $41, %r9
-; AVX512F-ONLY-NEXT:    orq %rax, %r9
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r13d
-; AVX512F-ONLY-NEXT:    kshiftrw $14, %k3, %k4
-; AVX512F-ONLY-NEXT:    shlq $42, %r12
-; AVX512F-ONLY-NEXT:    orq %r9, %r12
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $43, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r9
-; AVX512F-ONLY-NEXT:    shlq $44, %r9
-; AVX512F-ONLY-NEXT:    orq %rax, %r9
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $45, %rax
-; AVX512F-ONLY-NEXT:    orq %r9, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r9
-; AVX512F-ONLY-NEXT:    shlq $46, %r9
-; AVX512F-ONLY-NEXT:    orq %rax, %r9
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $47, %rax
-; AVX512F-ONLY-NEXT:    orq %r9, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $48, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r9d
-; AVX512F-ONLY-NEXT:    kshiftrw $4, %k2, %k4
-; AVX512F-ONLY-NEXT:    shlq $49, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    andl $1, %r9d
-; AVX512F-ONLY-NEXT:    movq %r9, %rax
-; AVX512F-ONLY-NEXT:    shlq $50, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r9, %r12
-; AVX512F-ONLY-NEXT:    shlq $51, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r9, %rax
-; AVX512F-ONLY-NEXT:    shlq $52, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r9, %r12
-; AVX512F-ONLY-NEXT:    shlq $53, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r9, %rax
-; AVX512F-ONLY-NEXT:    shlq $54, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r9, %r12
-; AVX512F-ONLY-NEXT:    shlq $55, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $56, %r9
-; AVX512F-ONLY-NEXT:    orq %r12, %r9
-; AVX512F-ONLY-NEXT:    movzbl %r10b, %eax
-; AVX512F-ONLY-NEXT:    # kill: def $r10d killed $r10d def $r10
-; AVX512F-ONLY-NEXT:    andl $1, %r10d
-; AVX512F-ONLY-NEXT:    movq %r10, %r12
-; AVX512F-ONLY-NEXT:    shlq $57, %r12
-; AVX512F-ONLY-NEXT:    orq %r9, %r12
-; AVX512F-ONLY-NEXT:    movq %r10, %r9
-; AVX512F-ONLY-NEXT:    shlq $58, %r9
-; AVX512F-ONLY-NEXT:    orq %r12, %r9
-; AVX512F-ONLY-NEXT:    movq %r10, %r12
-; AVX512F-ONLY-NEXT:    shlq $59, %r12
-; AVX512F-ONLY-NEXT:    orq %r9, %r12
-; AVX512F-ONLY-NEXT:    movq %r10, %r9
-; AVX512F-ONLY-NEXT:    shlq $60, %r9
-; AVX512F-ONLY-NEXT:    orq %r12, %r9
-; AVX512F-ONLY-NEXT:    movq %r10, %r12
-; AVX512F-ONLY-NEXT:    shlq $61, %r12
-; AVX512F-ONLY-NEXT:    orq %r9, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r9d
-; AVX512F-ONLY-NEXT:    kshiftrw $6, %k2, %k4
-; AVX512F-ONLY-NEXT:    shlq $62, %r10
-; AVX512F-ONLY-NEXT:    orq %r12, %r10
-; AVX512F-ONLY-NEXT:    shlq $63, %rax
-; AVX512F-ONLY-NEXT:    orq %r10, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movq %rax, 48(%rsi)
-; AVX512F-ONLY-NEXT:    movzbl %r9b, %r10d
-; AVX512F-ONLY-NEXT:    movl %r10d, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    leaq (%rax,%rax,2), %r15
-; AVX512F-ONLY-NEXT:    leaq (%r15,%rax,4), %r15
-; AVX512F-ONLY-NEXT:    movzbl %bpl, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    leaq (%r15,%rax,8), %r15
-; AVX512F-ONLY-NEXT:    movq %rax, %r13
-; AVX512F-ONLY-NEXT:    shlq $4, %r13
-; AVX512F-ONLY-NEXT:    orq %r15, %r13
-; AVX512F-ONLY-NEXT:    movq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $5, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    movq %rax, %r15
-; AVX512F-ONLY-NEXT:    shlq $6, %r15
-; AVX512F-ONLY-NEXT:    movq %rax, %r13
-; AVX512F-ONLY-NEXT:    shlq $7, %r13
-; AVX512F-ONLY-NEXT:    orq %r15, %r13
-; AVX512F-ONLY-NEXT:    movq %rax, %r15
-; AVX512F-ONLY-NEXT:    shlq $8, %r15
-; AVX512F-ONLY-NEXT:    orq %r13, %r15
-; AVX512F-ONLY-NEXT:    kmovw %k4, %ebp
-; AVX512F-ONLY-NEXT:    kshiftrw $7, %k2, %k4
-; AVX512F-ONLY-NEXT:    shlq $9, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movzbl %bpl, %r15d
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    movq %r15, %r13
-; AVX512F-ONLY-NEXT:    shlq $10, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $11, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %r13
-; AVX512F-ONLY-NEXT:    shlq $12, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $13, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %r13
-; AVX512F-ONLY-NEXT:    shlq $14, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $15, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k4, %ebp
-; AVX512F-ONLY-NEXT:    kshiftrw $8, %k2, %k4
-; AVX512F-ONLY-NEXT:    shlq $16, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    movzbl %bpl, %r13d
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $17, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r15
-; AVX512F-ONLY-NEXT:    shlq $18, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $19, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r15
-; AVX512F-ONLY-NEXT:    shlq $20, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $21, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r15
-; AVX512F-ONLY-NEXT:    shlq $22, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    kmovw %k4, %eax
-; AVX512F-ONLY-NEXT:    kshiftrw $9, %k2, %k4
-; AVX512F-ONLY-NEXT:    shlq $23, %r13
-; AVX512F-ONLY-NEXT:    orq %r15, %r13
-; AVX512F-ONLY-NEXT:    movzbl %al, %ebp
-; AVX512F-ONLY-NEXT:    andl $1, %ebp
-; AVX512F-ONLY-NEXT:    movq %rbp, %rax
-; AVX512F-ONLY-NEXT:    shlq $24, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %rbp, %r15
-; AVX512F-ONLY-NEXT:    shlq $25, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    movq %rbp, %rax
-; AVX512F-ONLY-NEXT:    shlq $26, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movq %rbp, %r15
-; AVX512F-ONLY-NEXT:    shlq $27, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    movq %rbp, %rax
-; AVX512F-ONLY-NEXT:    shlq $28, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movq %rbp, %r13
-; AVX512F-ONLY-NEXT:    shlq $29, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r15d
-; AVX512F-ONLY-NEXT:    kshiftrw $10, %k2, %k4
-; AVX512F-ONLY-NEXT:    shlq $30, %rbp
-; AVX512F-ONLY-NEXT:    orq %r13, %rbp
-; AVX512F-ONLY-NEXT:    movzbl %r15b, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    shlq $31, %rax
-; AVX512F-ONLY-NEXT:    orq %rbp, %rax
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    movq %r15, %r13
-; AVX512F-ONLY-NEXT:    shlq $32, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $33, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %r13
-; AVX512F-ONLY-NEXT:    shlq $34, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $35, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %rbp
-; AVX512F-ONLY-NEXT:    shlq $36, %rbp
-; AVX512F-ONLY-NEXT:    orq %rax, %rbp
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r13d
-; AVX512F-ONLY-NEXT:    kshiftrw $11, %k2, %k4
-; AVX512F-ONLY-NEXT:    shlq $37, %r15
-; AVX512F-ONLY-NEXT:    orq %rbp, %r15
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $38, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r15
-; AVX512F-ONLY-NEXT:    shlq $39, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $40, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r15
-; AVX512F-ONLY-NEXT:    shlq $41, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $42, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r15
-; AVX512F-ONLY-NEXT:    shlq $43, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    kmovw %k4, %ebp
-; AVX512F-ONLY-NEXT:    kshiftrw $12, %k2, %k4
-; AVX512F-ONLY-NEXT:    shlq $44, %r13
-; AVX512F-ONLY-NEXT:    orq %r15, %r13
-; AVX512F-ONLY-NEXT:    andl $1, %ebp
-; AVX512F-ONLY-NEXT:    movq %rbp, %rax
-; AVX512F-ONLY-NEXT:    shlq $45, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %rbp, %r15
-; AVX512F-ONLY-NEXT:    shlq $46, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    movq %rbp, %rax
-; AVX512F-ONLY-NEXT:    shlq $47, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movq %rbp, %r15
-; AVX512F-ONLY-NEXT:    shlq $48, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    movq %rbp, %rax
-; AVX512F-ONLY-NEXT:    shlq $49, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movq %rbp, %r13
-; AVX512F-ONLY-NEXT:    shlq $50, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r15d
-; AVX512F-ONLY-NEXT:    kshiftrw $6, %k0, %k4
-; AVX512F-ONLY-NEXT:    shlq $51, %rbp
-; AVX512F-ONLY-NEXT:    orq %r13, %rbp
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $52, %rax
-; AVX512F-ONLY-NEXT:    orq %rbp, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %r13
-; AVX512F-ONLY-NEXT:    shlq $53, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $54, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %r13
-; AVX512F-ONLY-NEXT:    shlq $55, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $56, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %r13
-; AVX512F-ONLY-NEXT:    shlq $57, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    shlq $58, %r15
-; AVX512F-ONLY-NEXT:    orq %r13, %r15
-; AVX512F-ONLY-NEXT:    movzbl %bl, %r13d
-; AVX512F-ONLY-NEXT:    movl %ebx, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %rbx
-; AVX512F-ONLY-NEXT:    shlq $59, %rbx
-; AVX512F-ONLY-NEXT:    orq %r15, %rbx
-; AVX512F-ONLY-NEXT:    movq %rax, %r15
-; AVX512F-ONLY-NEXT:    shlq $60, %r15
-; AVX512F-ONLY-NEXT:    orq %rbx, %r15
-; AVX512F-ONLY-NEXT:    movq %rax, %rbx
-; AVX512F-ONLY-NEXT:    shlq $61, %rbx
-; AVX512F-ONLY-NEXT:    orq %r15, %rbx
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r15d
-; AVX512F-ONLY-NEXT:    kshiftrw $14, %k2, %k4
-; AVX512F-ONLY-NEXT:    shlq $62, %rax
-; AVX512F-ONLY-NEXT:    orq %rbx, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %rbx
-; AVX512F-ONLY-NEXT:    shlq $63, %rbx
-; AVX512F-ONLY-NEXT:    orq %rax, %rbx
-; AVX512F-ONLY-NEXT:    kmovw %k4, %eax
-; AVX512F-ONLY-NEXT:    kshiftrw $15, %k2, %k4
-; AVX512F-ONLY-NEXT:    orq %r12, %rbx
-; AVX512F-ONLY-NEXT:    movq %rbx, 32(%rsi)
-; AVX512F-ONLY-NEXT:    movzbl %al, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    leaq (%r13,%r13,2), %rbx
-; AVX512F-ONLY-NEXT:    leaq (%rbx,%rax,4), %rbx
-; AVX512F-ONLY-NEXT:    leaq (%rbx,%rax,8), %rbx
-; AVX512F-ONLY-NEXT:    movq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $4, %r12
-; AVX512F-ONLY-NEXT:    orq %rbx, %r12
-; AVX512F-ONLY-NEXT:    movq %rax, %rbx
-; AVX512F-ONLY-NEXT:    shlq $5, %rbx
-; AVX512F-ONLY-NEXT:    orq %r12, %rbx
-; AVX512F-ONLY-NEXT:    movq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $6, %r12
-; AVX512F-ONLY-NEXT:    movq %rax, %r13
-; AVX512F-ONLY-NEXT:    shlq $7, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k4, %ebp
-; AVX512F-ONLY-NEXT:    kshiftrw $1, %k3, %k4
-; AVX512F-ONLY-NEXT:    shlq $8, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movzbl %bpl, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $9, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $10, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $11, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $12, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $13, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $14, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $15, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k3, %eax
-; AVX512F-ONLY-NEXT:    movzbl %al, %r13d
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $16, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $17, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $18, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $19, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $20, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $21, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k4, %eax
-; AVX512F-ONLY-NEXT:    kshiftrw $2, %k3, %k4
-; AVX512F-ONLY-NEXT:    shlq $22, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    movzbl %al, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $23, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $24, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $25, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $26, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $27, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $28, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k4, %eax
-; AVX512F-ONLY-NEXT:    kshiftrw $3, %k3, %k4
-; AVX512F-ONLY-NEXT:    shlq $29, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    movzbl %al, %r13d
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $30, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $31, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $32, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $33, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $34, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $35, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k4, %ebp
-; AVX512F-ONLY-NEXT:    kshiftrw $4, %k3, %k4
-; AVX512F-ONLY-NEXT:    shlq $36, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    andl $1, %ebp
-; AVX512F-ONLY-NEXT:    movq %rbp, %rax
-; AVX512F-ONLY-NEXT:    shlq $37, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %rbp, %r12
-; AVX512F-ONLY-NEXT:    shlq $38, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %rbp, %rax
-; AVX512F-ONLY-NEXT:    shlq $39, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %rbp, %r12
-; AVX512F-ONLY-NEXT:    shlq $40, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %rbp, %rax
-; AVX512F-ONLY-NEXT:    shlq $41, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %rbp, %r13
-; AVX512F-ONLY-NEXT:    shlq $42, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k4, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $5, %k3, %k3
-; AVX512F-ONLY-NEXT:    shlq $43, %rbp
-; AVX512F-ONLY-NEXT:    orq %r13, %rbp
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $44, %rax
-; AVX512F-ONLY-NEXT:    orq %rbp, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $45, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $46, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $47, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $48, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $49, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k3, %eax
-; AVX512F-ONLY-NEXT:    kshiftrw $3, %k0, %k3
-; AVX512F-ONLY-NEXT:    shlq $50, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    movzbl %al, %r13d
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $51, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $52, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $53, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $54, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $55, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $56, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k3, %ebp
-; AVX512F-ONLY-NEXT:    kshiftrw $2, %k0, %k3
-; AVX512F-ONLY-NEXT:    shlq $57, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    andl $1, %r11d
-; AVX512F-ONLY-NEXT:    movq %r11, %rax
-; AVX512F-ONLY-NEXT:    shlq $58, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r11, %r12
-; AVX512F-ONLY-NEXT:    shlq $59, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r11, %rax
-; AVX512F-ONLY-NEXT:    shlq $60, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r11, %r12
-; AVX512F-ONLY-NEXT:    shlq $61, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k3, %eax
-; AVX512F-ONLY-NEXT:    kshiftrw $4, %k0, %k3
-; AVX512F-ONLY-NEXT:    shlq $62, %r11
-; AVX512F-ONLY-NEXT:    orq %r12, %r11
-; AVX512F-ONLY-NEXT:    shlq $63, %r14
-; AVX512F-ONLY-NEXT:    orq %r11, %r14
-; AVX512F-ONLY-NEXT:    orq %rbx, %r14
-; AVX512F-ONLY-NEXT:    movq %r14, 40(%rsi)
-; AVX512F-ONLY-NEXT:    movzbl %al, %r11d
-; AVX512F-ONLY-NEXT:    movl %r11d, %ebx
-; AVX512F-ONLY-NEXT:    andl $1, %ebx
-; AVX512F-ONLY-NEXT:    leaq (%rbx,%rbx,2), %rax
-; AVX512F-ONLY-NEXT:    leaq (%rax,%rbx,4), %rax
-; AVX512F-ONLY-NEXT:    leaq (%rax,%rbx,8), %rax
-; AVX512F-ONLY-NEXT:    movq %rbx, %r12
-; AVX512F-ONLY-NEXT:    shlq $4, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movzbl %bpl, %r13d
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %r14
-; AVX512F-ONLY-NEXT:    shlq $5, %r14
-; AVX512F-ONLY-NEXT:    orq %r12, %r14
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $6, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $7, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $8, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $9, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $10, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k3, %ebp
-; AVX512F-ONLY-NEXT:    kshiftrw $5, %k0, %k3
-; AVX512F-ONLY-NEXT:    shlq $11, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movzbl %bpl, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $12, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $13, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $14, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $15, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $16, %rax
-; AVX512F-ONLY-NEXT:    orq %r13, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $17, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k3, %eax
-; AVX512F-ONLY-NEXT:    kshiftrw $11, %k0, %k3
-; AVX512F-ONLY-NEXT:    shlq $18, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    movzbl %al, %r13d
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $19, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $20, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $21, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $22, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $23, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %r12
-; AVX512F-ONLY-NEXT:    shlq $24, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $25, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    movzbl %r15b, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $26, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    movq %rax, %r13
-; AVX512F-ONLY-NEXT:    shlq $27, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    movq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $28, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    movq %rax, %r13
-; AVX512F-ONLY-NEXT:    shlq $29, %r13
-; AVX512F-ONLY-NEXT:    orq %r12, %r13
-; AVX512F-ONLY-NEXT:    movq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $30, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k3, %ebp
-; AVX512F-ONLY-NEXT:    kshiftrw $7, %k0, %k3
-; AVX512F-ONLY-NEXT:    shlq $31, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    kmovw %k3, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $8, %k0, %k3
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    shlq $32, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $33, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r15
-; AVX512F-ONLY-NEXT:    shlq $34, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $35, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r15
-; AVX512F-ONLY-NEXT:    shlq $36, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $37, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $38, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k3, %r15d
-; AVX512F-ONLY-NEXT:    kshiftrw $9, %k0, %k3
-; AVX512F-ONLY-NEXT:    shlq $39, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $40, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $41, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $42, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $43, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $44, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %r13
-; AVX512F-ONLY-NEXT:    shlq $45, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k3, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $10, %k0, %k3
-; AVX512F-ONLY-NEXT:    shlq $46, %r15
-; AVX512F-ONLY-NEXT:    orq %r13, %r15
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $47, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r15
-; AVX512F-ONLY-NEXT:    shlq $48, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $49, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r15
-; AVX512F-ONLY-NEXT:    shlq $50, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $51, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $52, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k3, %r15d
-; AVX512F-ONLY-NEXT:    kshiftrw $10, %k1, %k3
-; AVX512F-ONLY-NEXT:    shlq $53, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $54, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $55, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $56, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $57, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $58, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $59, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $60, %r15
-; AVX512F-ONLY-NEXT:    orq %r12, %r15
-; AVX512F-ONLY-NEXT:    movzbl %bpl, %eax
-; AVX512F-ONLY-NEXT:    movl %ebp, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $61, %r13
-; AVX512F-ONLY-NEXT:    orq %r15, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k3, %ebp
-; AVX512F-ONLY-NEXT:    kshiftrw $12, %k0, %k3
-; AVX512F-ONLY-NEXT:    shlq $62, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    movq %rax, %r15
-; AVX512F-ONLY-NEXT:    shlq $63, %r15
-; AVX512F-ONLY-NEXT:    orq %r12, %r15
-; AVX512F-ONLY-NEXT:    kmovw %k3, %r12d
-; AVX512F-ONLY-NEXT:    kshiftrw $13, %k0, %k3
-; AVX512F-ONLY-NEXT:    orq %r14, %r15
-; AVX512F-ONLY-NEXT:    movq %r15, 16(%rsi)
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    leaq (%rax,%rax,2), %r14
-; AVX512F-ONLY-NEXT:    leaq (%r14,%rax,4), %r14
-; AVX512F-ONLY-NEXT:    leaq (%r14,%rax,8), %rax
-; AVX512F-ONLY-NEXT:    movzbl %r12b, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %r15
-; AVX512F-ONLY-NEXT:    shlq $4, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    movq %r12, %r14
-; AVX512F-ONLY-NEXT:    shlq $5, %r14
-; AVX512F-ONLY-NEXT:    orq %r15, %r14
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $6, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r15
-; AVX512F-ONLY-NEXT:    shlq $7, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $8, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r15
-; AVX512F-ONLY-NEXT:    shlq $9, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    kmovw %k3, %eax
-; AVX512F-ONLY-NEXT:    kshiftrw $14, %k0, %k3
-; AVX512F-ONLY-NEXT:    shlq $10, %r12
-; AVX512F-ONLY-NEXT:    orq %r15, %r12
-; AVX512F-ONLY-NEXT:    movzbl %al, %r15d
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $11, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $12, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $13, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $14, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $15, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $16, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k3, %eax
-; AVX512F-ONLY-NEXT:    kshiftrw $15, %k0, %k3
-; AVX512F-ONLY-NEXT:    shlq $17, %r15
-; AVX512F-ONLY-NEXT:    orq %r12, %r15
-; AVX512F-ONLY-NEXT:    movzbl %al, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $18, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r15
-; AVX512F-ONLY-NEXT:    shlq $19, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $20, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r15
-; AVX512F-ONLY-NEXT:    shlq $21, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $22, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r15
-; AVX512F-ONLY-NEXT:    shlq $23, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    kmovw %k3, %eax
-; AVX512F-ONLY-NEXT:    kshiftrw $1, %k2, %k3
-; AVX512F-ONLY-NEXT:    shlq $24, %r12
-; AVX512F-ONLY-NEXT:    orq %r15, %r12
-; AVX512F-ONLY-NEXT:    movzbl %al, %r15d
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $25, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $26, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $27, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $28, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $29, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $30, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    shlq $31, %r15
-; AVX512F-ONLY-NEXT:    orq %r12, %r15
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $32, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r15
-; AVX512F-ONLY-NEXT:    shlq $33, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $34, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r15
-; AVX512F-ONLY-NEXT:    shlq $35, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $36, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $37, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k3, %r15d
-; AVX512F-ONLY-NEXT:    kshiftrw $2, %k2, %k3
-; AVX512F-ONLY-NEXT:    shlq $38, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $39, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $40, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $41, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $42, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $43, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $44, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k3, %eax
-; AVX512F-ONLY-NEXT:    kshiftrw $3, %k2, %k2
-; AVX512F-ONLY-NEXT:    shlq $45, %r15
-; AVX512F-ONLY-NEXT:    orq %r12, %r15
-; AVX512F-ONLY-NEXT:    movzbl %al, %r12d
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $46, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r15
-; AVX512F-ONLY-NEXT:    shlq $47, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $48, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r15
-; AVX512F-ONLY-NEXT:    shlq $49, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $50, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %r13
-; AVX512F-ONLY-NEXT:    shlq $51, %r13
-; AVX512F-ONLY-NEXT:    orq %rax, %r13
-; AVX512F-ONLY-NEXT:    kmovw %k2, %r15d
-; AVX512F-ONLY-NEXT:    kshiftrw $14, %k1, %k1
-; AVX512F-ONLY-NEXT:    shlq $52, %r12
-; AVX512F-ONLY-NEXT:    orq %r13, %r12
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $53, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $54, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $55, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $56, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $57, %rax
-; AVX512F-ONLY-NEXT:    orq %r12, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %r12
-; AVX512F-ONLY-NEXT:    shlq $58, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k1, %r13d
-; AVX512F-ONLY-NEXT:    shlq $59, %r15
-; AVX512F-ONLY-NEXT:    orq %r12, %r15
-; AVX512F-ONLY-NEXT:    andl $1, %r9d
-; AVX512F-ONLY-NEXT:    movq %r9, %rax
-; AVX512F-ONLY-NEXT:    shlq $60, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movq %r9, %r12
-; AVX512F-ONLY-NEXT:    shlq $61, %r12
-; AVX512F-ONLY-NEXT:    orq %rax, %r12
-; AVX512F-ONLY-NEXT:    kmovw %k0, %r15d
-; AVX512F-ONLY-NEXT:    kshiftrw $1, %k0, %k0
-; AVX512F-ONLY-NEXT:    shlq $62, %r9
-; AVX512F-ONLY-NEXT:    orq %r12, %r9
-; AVX512F-ONLY-NEXT:    kmovw %k0, %r12d
-; AVX512F-ONLY-NEXT:    shlq $63, %r10
-; AVX512F-ONLY-NEXT:    orq %r9, %r10
-; AVX512F-ONLY-NEXT:    orq %r14, %r10
-; AVX512F-ONLY-NEXT:    movq %r10, 24(%rsi)
-; AVX512F-ONLY-NEXT:    andl $1, %r8d
-; AVX512F-ONLY-NEXT:    leaq (%r8,%r8,2), %rax
-; AVX512F-ONLY-NEXT:    leaq (%rax,%r8,4), %rax
-; AVX512F-ONLY-NEXT:    leaq (%rax,%r8,8), %rax
-; AVX512F-ONLY-NEXT:    movq %r8, %r9
-; AVX512F-ONLY-NEXT:    shlq $4, %r9
-; AVX512F-ONLY-NEXT:    orq %rax, %r9
-; AVX512F-ONLY-NEXT:    shlq $5, %r8
-; AVX512F-ONLY-NEXT:    orq %r9, %r8
-; AVX512F-ONLY-NEXT:    movzbl %bpl, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %r9
-; AVX512F-ONLY-NEXT:    shlq $6, %r9
-; AVX512F-ONLY-NEXT:    movq %rax, %r10
-; AVX512F-ONLY-NEXT:    shlq $7, %r10
-; AVX512F-ONLY-NEXT:    orq %r9, %r10
-; AVX512F-ONLY-NEXT:    movq %rax, %r9
-; AVX512F-ONLY-NEXT:    shlq $8, %r9
-; AVX512F-ONLY-NEXT:    orq %r10, %r9
-; AVX512F-ONLY-NEXT:    movq %rax, %r10
-; AVX512F-ONLY-NEXT:    shlq $9, %r10
-; AVX512F-ONLY-NEXT:    orq %r9, %r10
-; AVX512F-ONLY-NEXT:    movq %rax, %r9
-; AVX512F-ONLY-NEXT:    shlq $10, %r9
-; AVX512F-ONLY-NEXT:    orq %r10, %r9
-; AVX512F-ONLY-NEXT:    movq %rax, %r10
-; AVX512F-ONLY-NEXT:    shlq $11, %r10
-; AVX512F-ONLY-NEXT:    orq %r9, %r10
-; AVX512F-ONLY-NEXT:    shlq $12, %rax
-; AVX512F-ONLY-NEXT:    orq %r10, %rax
-; AVX512F-ONLY-NEXT:    movzbl %dil, %edi
-; AVX512F-ONLY-NEXT:    andl $1, %edi
-; AVX512F-ONLY-NEXT:    movq %rdi, %r9
-; AVX512F-ONLY-NEXT:    shlq $13, %r9
-; AVX512F-ONLY-NEXT:    orq %rax, %r9
-; AVX512F-ONLY-NEXT:    movq %rdi, %rax
-; AVX512F-ONLY-NEXT:    shlq $14, %rax
-; AVX512F-ONLY-NEXT:    orq %r9, %rax
-; AVX512F-ONLY-NEXT:    movq %rdi, %r9
-; AVX512F-ONLY-NEXT:    shlq $15, %r9
-; AVX512F-ONLY-NEXT:    orq %rax, %r9
-; AVX512F-ONLY-NEXT:    movq %rdi, %rax
-; AVX512F-ONLY-NEXT:    shlq $16, %rax
-; AVX512F-ONLY-NEXT:    orq %r9, %rax
-; AVX512F-ONLY-NEXT:    movq %rdi, %r9
-; AVX512F-ONLY-NEXT:    shlq $17, %r9
-; AVX512F-ONLY-NEXT:    orq %rax, %r9
-; AVX512F-ONLY-NEXT:    movq %rdi, %rax
-; AVX512F-ONLY-NEXT:    shlq $18, %rax
-; AVX512F-ONLY-NEXT:    orq %r9, %rax
-; AVX512F-ONLY-NEXT:    shlq $19, %rdi
-; AVX512F-ONLY-NEXT:    orq %rax, %rdi
-; AVX512F-ONLY-NEXT:    movzbl %dl, %eax
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %rdx
-; AVX512F-ONLY-NEXT:    shlq $20, %rdx
-; AVX512F-ONLY-NEXT:    orq %rdi, %rdx
-; AVX512F-ONLY-NEXT:    movq %rax, %rdi
-; AVX512F-ONLY-NEXT:    shlq $21, %rdi
-; AVX512F-ONLY-NEXT:    orq %rdx, %rdi
-; AVX512F-ONLY-NEXT:    movq %rax, %rdx
-; AVX512F-ONLY-NEXT:    shlq $22, %rdx
-; AVX512F-ONLY-NEXT:    orq %rdi, %rdx
-; AVX512F-ONLY-NEXT:    movq %rax, %rdi
-; AVX512F-ONLY-NEXT:    shlq $23, %rdi
-; AVX512F-ONLY-NEXT:    orq %rdx, %rdi
-; AVX512F-ONLY-NEXT:    movq %rax, %rdx
-; AVX512F-ONLY-NEXT:    shlq $24, %rdx
-; AVX512F-ONLY-NEXT:    orq %rdi, %rdx
-; AVX512F-ONLY-NEXT:    movq %rax, %rdi
-; AVX512F-ONLY-NEXT:    shlq $25, %rdi
-; AVX512F-ONLY-NEXT:    orq %rdx, %rdi
-; AVX512F-ONLY-NEXT:    shlq $26, %rax
-; AVX512F-ONLY-NEXT:    orq %rdi, %rax
-; AVX512F-ONLY-NEXT:    movzbl %cl, %ecx
-; AVX512F-ONLY-NEXT:    andl $1, %ecx
-; AVX512F-ONLY-NEXT:    movq %rcx, %rdx
-; AVX512F-ONLY-NEXT:    shlq $27, %rdx
-; AVX512F-ONLY-NEXT:    orq %rax, %rdx
-; AVX512F-ONLY-NEXT:    movq %rcx, %rax
-; AVX512F-ONLY-NEXT:    shlq $28, %rax
-; AVX512F-ONLY-NEXT:    orq %rdx, %rax
-; AVX512F-ONLY-NEXT:    movq %rcx, %rdx
-; AVX512F-ONLY-NEXT:    shlq $29, %rdx
-; AVX512F-ONLY-NEXT:    orq %rax, %rdx
-; AVX512F-ONLY-NEXT:    movq %rcx, %rax
-; AVX512F-ONLY-NEXT:    shlq $30, %rax
-; AVX512F-ONLY-NEXT:    orq %rdx, %rax
-; AVX512F-ONLY-NEXT:    movq %rcx, %rdx
-; AVX512F-ONLY-NEXT:    shlq $31, %rdx
-; AVX512F-ONLY-NEXT:    orq %rax, %rdx
-; AVX512F-ONLY-NEXT:    movq %rcx, %rax
-; AVX512F-ONLY-NEXT:    shlq $32, %rax
-; AVX512F-ONLY-NEXT:    orq %rdx, %rax
-; AVX512F-ONLY-NEXT:    shlq $33, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    andl $1, %r13d
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $34, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %rcx
-; AVX512F-ONLY-NEXT:    shlq $35, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $36, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %rcx
-; AVX512F-ONLY-NEXT:    shlq $37, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    movq %r13, %rax
-; AVX512F-ONLY-NEXT:    shlq $38, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    movq %r13, %rcx
-; AVX512F-ONLY-NEXT:    shlq $39, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    shlq $40, %r13
-; AVX512F-ONLY-NEXT:    orq %rcx, %r13
-; AVX512F-ONLY-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX512F-ONLY-NEXT:    andl $1, %eax
-; AVX512F-ONLY-NEXT:    movq %rax, %rcx
-; AVX512F-ONLY-NEXT:    shlq $41, %rcx
-; AVX512F-ONLY-NEXT:    orq %r13, %rcx
-; AVX512F-ONLY-NEXT:    movq %rax, %rdx
-; AVX512F-ONLY-NEXT:    shlq $42, %rdx
-; AVX512F-ONLY-NEXT:    orq %rcx, %rdx
-; AVX512F-ONLY-NEXT:    movq %rax, %rcx
-; AVX512F-ONLY-NEXT:    shlq $43, %rcx
-; AVX512F-ONLY-NEXT:    orq %rdx, %rcx
-; AVX512F-ONLY-NEXT:    movq %rax, %rdx
-; AVX512F-ONLY-NEXT:    shlq $44, %rdx
-; AVX512F-ONLY-NEXT:    orq %rcx, %rdx
-; AVX512F-ONLY-NEXT:    movq %rax, %rcx
-; AVX512F-ONLY-NEXT:    shlq $45, %rcx
-; AVX512F-ONLY-NEXT:    orq %rdx, %rcx
-; AVX512F-ONLY-NEXT:    movq %rax, %rdx
-; AVX512F-ONLY-NEXT:    shlq $46, %rdx
-; AVX512F-ONLY-NEXT:    orq %rcx, %rdx
-; AVX512F-ONLY-NEXT:    shlq $47, %rax
-; AVX512F-ONLY-NEXT:    orq %rdx, %rax
-; AVX512F-ONLY-NEXT:    andl $1, %r15d
-; AVX512F-ONLY-NEXT:    movq %r15, %rcx
-; AVX512F-ONLY-NEXT:    shlq $48, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $49, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %rcx
-; AVX512F-ONLY-NEXT:    shlq $50, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $51, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    movq %r15, %rcx
-; AVX512F-ONLY-NEXT:    shlq $52, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    movq %r15, %rax
-; AVX512F-ONLY-NEXT:    shlq $53, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    shlq $54, %r15
-; AVX512F-ONLY-NEXT:    orq %rax, %r15
-; AVX512F-ONLY-NEXT:    andl $1, %r12d
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $55, %rax
-; AVX512F-ONLY-NEXT:    orq %r15, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %rcx
-; AVX512F-ONLY-NEXT:    shlq $56, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $57, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %rcx
-; AVX512F-ONLY-NEXT:    shlq $58, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    movq %r12, %rax
-; AVX512F-ONLY-NEXT:    shlq $59, %rax
-; AVX512F-ONLY-NEXT:    orq %rcx, %rax
-; AVX512F-ONLY-NEXT:    movq %r12, %rcx
-; AVX512F-ONLY-NEXT:    shlq $60, %rcx
-; AVX512F-ONLY-NEXT:    orq %rax, %rcx
-; AVX512F-ONLY-NEXT:    shlq $61, %r12
-; AVX512F-ONLY-NEXT:    orq %rcx, %r12
-; AVX512F-ONLY-NEXT:    shlq $62, %rbx
-; AVX512F-ONLY-NEXT:    orq %r12, %rbx
-; AVX512F-ONLY-NEXT:    shlq $63, %r11
-; AVX512F-ONLY-NEXT:    orq %rbx, %r11
-; AVX512F-ONLY-NEXT:    orq %r8, %r11
-; AVX512F-ONLY-NEXT:    movq %r11, 8(%rsi)
-; AVX512F-ONLY-NEXT:    popq %rbx
-; AVX512F-ONLY-NEXT:    popq %r12
-; AVX512F-ONLY-NEXT:    popq %r13
-; AVX512F-ONLY-NEXT:    popq %r14
-; AVX512F-ONLY-NEXT:    popq %r15
-; AVX512F-ONLY-NEXT:    popq %rbp
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm2
+; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; AVX512F-ONLY-NEXT:    movw $1, %ax
+; AVX512F-ONLY-NEXT:    kmovw %eax, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1}
+; AVX512F-ONLY-NEXT:    kmovw 6(%rdi), %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
+; AVX512F-ONLY-NEXT:    kmovw 4(%rdi), %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z}
+; AVX512F-ONLY-NEXT:    kmovw 2(%rdi), %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15]
+; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm2, %zmm6
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13]
+; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm7, %zmm8
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11]
+; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm9, %zmm10
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9]
+; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm11, %zmm12
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm13 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6]
+; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm13, %zmm14
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm15 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4]
+; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm15, %zmm16
+; AVX512F-ONLY-NEXT:    vpermd %zmm3, %zmm1, %zmm3
+; AVX512F-ONLY-NEXT:    vpermd %zmm4, %zmm2, %zmm17
+; AVX512F-ONLY-NEXT:    vpermd %zmm4, %zmm7, %zmm18
+; AVX512F-ONLY-NEXT:    vpermd %zmm4, %zmm9, %zmm19
+; AVX512F-ONLY-NEXT:    vpermd %zmm4, %zmm11, %zmm20
+; AVX512F-ONLY-NEXT:    vpermd %zmm4, %zmm13, %zmm21
+; AVX512F-ONLY-NEXT:    vpermd %zmm4, %zmm15, %zmm22
+; AVX512F-ONLY-NEXT:    vpermd %zmm4, %zmm1, %zmm4
+; AVX512F-ONLY-NEXT:    vpermd %zmm5, %zmm2, %zmm23
+; AVX512F-ONLY-NEXT:    vpermd %zmm5, %zmm7, %zmm24
+; AVX512F-ONLY-NEXT:    vpermd %zmm5, %zmm9, %zmm25
+; AVX512F-ONLY-NEXT:    vpermd %zmm5, %zmm11, %zmm26
+; AVX512F-ONLY-NEXT:    vpermd %zmm5, %zmm13, %zmm27
+; AVX512F-ONLY-NEXT:    vpermd %zmm5, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    vpermd %zmm5, %zmm15, %zmm5
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm2, %zmm2
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm7, %zmm7
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm9, %zmm9
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm11, %zmm11
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm13, %zmm13
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm15, %zmm0
+; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm15 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm13, %zmm13, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm13 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm11, %zmm11, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm11 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm9, %zmm9, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 256(%rsi), %zmm9 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm7, %zmm7, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 320(%rsi), %zmm7 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 384(%rsi), %zmm2 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 448(%rsi), %zmm1 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm5, %zmm5, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 512(%rsi), %zmm5 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm27, %zmm27, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 576(%rsi), %zmm27 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm26, %zmm26, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 640(%rsi), %zmm26 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm25, %zmm25, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 704(%rsi), %zmm25 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm24, %zmm24, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 768(%rsi), %zmm24 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm23, %zmm23, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 832(%rsi), %zmm23 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm4, %zmm4, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 896(%rsi), %zmm4 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm22, %zmm22, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 960(%rsi), %zmm22 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm21, %zmm21, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 1024(%rsi), %zmm21 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm20, %zmm20, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 1088(%rsi), %zmm20 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm19, %zmm19, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 1152(%rsi), %zmm19 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm18, %zmm18, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 1216(%rsi), %zmm18 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm17, %zmm17, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 1280(%rsi), %zmm17 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm3, %zmm3, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 1344(%rsi), %zmm3 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm16, %zmm16, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 1408(%rsi), %zmm16 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm14, %zmm14, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 1472(%rsi), %zmm14 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm12, %zmm12, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 1536(%rsi), %zmm12 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm10, %zmm10, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 1600(%rsi), %zmm10 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm8, %zmm8, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 1664(%rsi), %zmm8 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm6, %zmm6, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 1728(%rsi), %zmm6 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm6, 1728(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm8, 1664(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm10, 1600(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm12, 1536(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm14, 1472(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm16, 1408(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm3, 1344(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm17, 1280(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm18, 1216(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm19, 1152(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm20, 1088(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm21, 1024(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm22, 960(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm4, 896(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm23, 832(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm24, 768(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm25, 704(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm26, 640(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm27, 576(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm5, 512(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm1, 448(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm2, 384(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm7, 320(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm9, 256(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm11, 192(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm13, 128(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm15, (%rdx)
+; AVX512F-ONLY-NEXT:    vzeroupper
 ; AVX512F-ONLY-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: mask_replication_factor7_vf64:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    pushq %rbp
-; AVX512DQ-NEXT:    pushq %r15
-; AVX512DQ-NEXT:    pushq %r14
-; AVX512DQ-NEXT:    pushq %r13
-; AVX512DQ-NEXT:    pushq %r12
-; AVX512DQ-NEXT:    pushq %rbx
-; AVX512DQ-NEXT:    kmovw (%rdi), %k1
+; AVX512DQ-NEXT:    kmovw (%rdi), %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm2
+; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm2
+; AVX512DQ-NEXT:    movw $1, %ax
+; AVX512DQ-NEXT:    kmovw %eax, %k1
+; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1}
+; AVX512DQ-NEXT:    kmovw 6(%rdi), %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm3
+; AVX512DQ-NEXT:    kmovw 4(%rdi), %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm4
 ; AVX512DQ-NEXT:    kmovw 2(%rdi), %k0
-; AVX512DQ-NEXT:    kmovw 4(%rdi), %k2
-; AVX512DQ-NEXT:    kmovw 6(%rdi), %k3
-; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k4
-; AVX512DQ-NEXT:    kmovw %k4, %eax
-; AVX512DQ-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512DQ-NEXT:    kshiftrw $13, %k1, %k4
-; AVX512DQ-NEXT:    kmovw %k4, %ecx
-; AVX512DQ-NEXT:    kshiftrw $12, %k1, %k4
-; AVX512DQ-NEXT:    kmovw %k4, %edx
-; AVX512DQ-NEXT:    kshiftrw $11, %k1, %k4
-; AVX512DQ-NEXT:    kmovw %k4, %edi
-; AVX512DQ-NEXT:    kshiftrw $13, %k2, %k4
-; AVX512DQ-NEXT:    kmovw %k4, %ebx
-; AVX512DQ-NEXT:    kshiftrw $5, %k2, %k4
-; AVX512DQ-NEXT:    kmovw %k4, %ebp
-; AVX512DQ-NEXT:    kshiftrw $11, %k3, %k4
-; AVX512DQ-NEXT:    kmovw %k4, %r9d
-; AVX512DQ-NEXT:    kshiftrw $8, %k3, %k4
-; AVX512DQ-NEXT:    kmovw %k4, %r10d
-; AVX512DQ-NEXT:    kshiftrw $6, %k3, %k4
-; AVX512DQ-NEXT:    kmovw %k4, %r11d
-; AVX512DQ-NEXT:    kshiftrw $4, %k1, %k4
-; AVX512DQ-NEXT:    kmovw %k4, %r8d
-; AVX512DQ-NEXT:    kshiftrw $1, %k1, %k4
-; AVX512DQ-NEXT:    kmovw %k4, %eax
-; AVX512DQ-NEXT:    kshiftrw $2, %k1, %k4
-; AVX512DQ-NEXT:    kmovw %k1, %r14d
-; AVX512DQ-NEXT:    movzbl %r14b, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    leaq (%r12,%r12,2), %r14
-; AVX512DQ-NEXT:    leaq (%r14,%r12,4), %r14
-; AVX512DQ-NEXT:    leaq (%r14,%r12,8), %r14
-; AVX512DQ-NEXT:    movq %r12, %r15
-; AVX512DQ-NEXT:    shlq $4, %r15
-; AVX512DQ-NEXT:    orq %r14, %r15
-; AVX512DQ-NEXT:    movq %r12, %r14
-; AVX512DQ-NEXT:    shlq $5, %r14
-; AVX512DQ-NEXT:    orq %r15, %r14
-; AVX512DQ-NEXT:    shlq $6, %r12
-; AVX512DQ-NEXT:    movzbl %al, %r15d
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $7, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $8, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $9, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $10, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $11, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $12, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    kmovw %k4, %eax
-; AVX512DQ-NEXT:    kshiftrw $3, %k1, %k4
-; AVX512DQ-NEXT:    shlq $13, %r15
-; AVX512DQ-NEXT:    orq %r12, %r15
-; AVX512DQ-NEXT:    movzbl %al, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $14, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movq %r12, %r15
-; AVX512DQ-NEXT:    shlq $15, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $16, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movq %r12, %r15
-; AVX512DQ-NEXT:    shlq $17, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $18, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movq %r12, %r15
-; AVX512DQ-NEXT:    shlq $19, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    kmovw %k4, %eax
-; AVX512DQ-NEXT:    kshiftrw $7, %k3, %k4
-; AVX512DQ-NEXT:    shlq $20, %r12
-; AVX512DQ-NEXT:    orq %r15, %r12
-; AVX512DQ-NEXT:    movzbl %al, %r15d
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $21, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $22, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $23, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $24, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $25, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $26, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    shlq $27, %r15
-; AVX512DQ-NEXT:    orq %r12, %r15
-; AVX512DQ-NEXT:    movzbl %r8b, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %r12
-; AVX512DQ-NEXT:    shlq $28, %r12
-; AVX512DQ-NEXT:    orq %r15, %r12
-; AVX512DQ-NEXT:    movq %rax, %r15
-; AVX512DQ-NEXT:    shlq $29, %r15
-; AVX512DQ-NEXT:    orq %r12, %r15
-; AVX512DQ-NEXT:    movq %rax, %r12
-; AVX512DQ-NEXT:    shlq $30, %r12
-; AVX512DQ-NEXT:    orq %r15, %r12
-; AVX512DQ-NEXT:    kmovw %k4, %r15d
-; AVX512DQ-NEXT:    kshiftrw $5, %k1, %k4
-; AVX512DQ-NEXT:    shlq $31, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    andl $1, %r8d
-; AVX512DQ-NEXT:    movq %r8, %r12
-; AVX512DQ-NEXT:    shlq $32, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r8, %rax
-; AVX512DQ-NEXT:    shlq $33, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    kmovw %k4, %r12d
-; AVX512DQ-NEXT:    kshiftrw $6, %k1, %k4
-; AVX512DQ-NEXT:    shlq $34, %r8
-; AVX512DQ-NEXT:    orq %rax, %r8
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $35, %rax
-; AVX512DQ-NEXT:    orq %r8, %rax
-; AVX512DQ-NEXT:    movq %r12, %r8
-; AVX512DQ-NEXT:    shlq $36, %r8
-; AVX512DQ-NEXT:    orq %rax, %r8
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $37, %rax
-; AVX512DQ-NEXT:    orq %r8, %rax
-; AVX512DQ-NEXT:    movq %r12, %r8
-; AVX512DQ-NEXT:    shlq $38, %r8
-; AVX512DQ-NEXT:    orq %rax, %r8
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $39, %rax
-; AVX512DQ-NEXT:    orq %r8, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $40, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    kmovw %k4, %r8d
-; AVX512DQ-NEXT:    kshiftrw $7, %k1, %k4
-; AVX512DQ-NEXT:    shlq $41, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    andl $1, %r8d
-; AVX512DQ-NEXT:    movq %r8, %rax
-; AVX512DQ-NEXT:    shlq $42, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r8, %r12
-; AVX512DQ-NEXT:    shlq $43, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r8, %rax
-; AVX512DQ-NEXT:    shlq $44, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r8, %r12
-; AVX512DQ-NEXT:    shlq $45, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r8, %rax
-; AVX512DQ-NEXT:    shlq $46, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r8, %r13
-; AVX512DQ-NEXT:    shlq $47, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    kmovw %k4, %r12d
-; AVX512DQ-NEXT:    kshiftrw $8, %k1, %k4
-; AVX512DQ-NEXT:    shlq $48, %r8
-; AVX512DQ-NEXT:    orq %r13, %r8
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $49, %rax
-; AVX512DQ-NEXT:    orq %r8, %rax
-; AVX512DQ-NEXT:    movq %r12, %r8
-; AVX512DQ-NEXT:    shlq $50, %r8
-; AVX512DQ-NEXT:    orq %rax, %r8
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $51, %rax
-; AVX512DQ-NEXT:    orq %r8, %rax
-; AVX512DQ-NEXT:    movq %r12, %r8
-; AVX512DQ-NEXT:    shlq $52, %r8
-; AVX512DQ-NEXT:    orq %rax, %r8
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $53, %rax
-; AVX512DQ-NEXT:    orq %r8, %rax
-; AVX512DQ-NEXT:    movq %r12, %r8
-; AVX512DQ-NEXT:    shlq $54, %r8
-; AVX512DQ-NEXT:    orq %rax, %r8
-; AVX512DQ-NEXT:    kmovw %k4, %eax
-; AVX512DQ-NEXT:    kshiftrw $9, %k1, %k4
-; AVX512DQ-NEXT:    shlq $55, %r12
-; AVX512DQ-NEXT:    orq %r8, %r12
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %r8
-; AVX512DQ-NEXT:    shlq $56, %r8
-; AVX512DQ-NEXT:    orq %r12, %r8
-; AVX512DQ-NEXT:    movq %rax, %r12
-; AVX512DQ-NEXT:    shlq $57, %r12
-; AVX512DQ-NEXT:    orq %r8, %r12
-; AVX512DQ-NEXT:    movq %rax, %r8
-; AVX512DQ-NEXT:    shlq $58, %r8
-; AVX512DQ-NEXT:    orq %r12, %r8
-; AVX512DQ-NEXT:    movq %rax, %r12
-; AVX512DQ-NEXT:    shlq $59, %r12
-; AVX512DQ-NEXT:    orq %r8, %r12
-; AVX512DQ-NEXT:    movq %rax, %r8
-; AVX512DQ-NEXT:    shlq $60, %r8
-; AVX512DQ-NEXT:    orq %r12, %r8
-; AVX512DQ-NEXT:    movq %rax, %r12
-; AVX512DQ-NEXT:    shlq $61, %r12
-; AVX512DQ-NEXT:    orq %r8, %r12
-; AVX512DQ-NEXT:    kmovw %k4, %r8d
-; AVX512DQ-NEXT:    kshiftrw $9, %k3, %k4
-; AVX512DQ-NEXT:    shlq $62, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movzbl %r8b, %r8d
-; AVX512DQ-NEXT:    movq %r8, %r12
-; AVX512DQ-NEXT:    shlq $63, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    orq %r14, %r12
-; AVX512DQ-NEXT:    movq %r12, (%rsi)
-; AVX512DQ-NEXT:    movzbl %r15b, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movzbl %r11b, %r14d
-; AVX512DQ-NEXT:    movl %r14d, %r15d
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    leaq (%r15,%rax,2), %r15
-; AVX512DQ-NEXT:    leaq (%r15,%rax,4), %r15
-; AVX512DQ-NEXT:    leaq (%r15,%rax,8), %r15
-; AVX512DQ-NEXT:    movq %rax, %r12
-; AVX512DQ-NEXT:    shlq $4, %r12
-; AVX512DQ-NEXT:    orq %r15, %r12
-; AVX512DQ-NEXT:    movq %rax, %r15
-; AVX512DQ-NEXT:    shlq $5, %r15
-; AVX512DQ-NEXT:    orq %r12, %r15
-; AVX512DQ-NEXT:    movq %rax, %r12
-; AVX512DQ-NEXT:    shlq $6, %r12
-; AVX512DQ-NEXT:    shlq $7, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movzbl %r10b, %r10d
-; AVX512DQ-NEXT:    andl $1, %r10d
-; AVX512DQ-NEXT:    movq %r10, %r12
-; AVX512DQ-NEXT:    shlq $8, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r10, %rax
-; AVX512DQ-NEXT:    shlq $9, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r10, %r12
-; AVX512DQ-NEXT:    shlq $10, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r10, %rax
-; AVX512DQ-NEXT:    shlq $11, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r10, %r12
-; AVX512DQ-NEXT:    shlq $12, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r10, %rax
-; AVX512DQ-NEXT:    shlq $13, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    kmovw %k4, %r12d
-; AVX512DQ-NEXT:    kshiftrw $10, %k3, %k4
-; AVX512DQ-NEXT:    shlq $14, %r10
-; AVX512DQ-NEXT:    orq %rax, %r10
-; AVX512DQ-NEXT:    movzbl %r12b, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $15, %rax
-; AVX512DQ-NEXT:    orq %r10, %rax
-; AVX512DQ-NEXT:    movq %r12, %r10
-; AVX512DQ-NEXT:    shlq $16, %r10
-; AVX512DQ-NEXT:    orq %rax, %r10
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $17, %rax
-; AVX512DQ-NEXT:    orq %r10, %rax
-; AVX512DQ-NEXT:    movq %r12, %r10
-; AVX512DQ-NEXT:    shlq $18, %r10
-; AVX512DQ-NEXT:    orq %rax, %r10
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $19, %rax
-; AVX512DQ-NEXT:    orq %r10, %rax
-; AVX512DQ-NEXT:    movq %r12, %r10
-; AVX512DQ-NEXT:    shlq $20, %r10
-; AVX512DQ-NEXT:    orq %rax, %r10
-; AVX512DQ-NEXT:    kmovw %k4, %eax
-; AVX512DQ-NEXT:    kshiftrw $15, %k3, %k4
-; AVX512DQ-NEXT:    shlq $21, %r12
-; AVX512DQ-NEXT:    orq %r10, %r12
-; AVX512DQ-NEXT:    movzbl %al, %r10d
-; AVX512DQ-NEXT:    andl $1, %r10d
-; AVX512DQ-NEXT:    movq %r10, %rax
-; AVX512DQ-NEXT:    shlq $22, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r10, %r12
-; AVX512DQ-NEXT:    shlq $23, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r10, %rax
-; AVX512DQ-NEXT:    shlq $24, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r10, %r12
-; AVX512DQ-NEXT:    shlq $25, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r10, %rax
-; AVX512DQ-NEXT:    shlq $26, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r10, %r12
-; AVX512DQ-NEXT:    shlq $27, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    shlq $28, %r10
-; AVX512DQ-NEXT:    orq %r12, %r10
-; AVX512DQ-NEXT:    movzbl %r9b, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %r12
-; AVX512DQ-NEXT:    shlq $29, %r12
-; AVX512DQ-NEXT:    orq %r10, %r12
-; AVX512DQ-NEXT:    movq %rax, %r13
-; AVX512DQ-NEXT:    shlq $30, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    kmovw %k4, %r10d
-; AVX512DQ-NEXT:    kshiftrw $12, %k3, %k4
-; AVX512DQ-NEXT:    shlq $31, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    andl $1, %r9d
-; AVX512DQ-NEXT:    movq %r9, %r12
-; AVX512DQ-NEXT:    shlq $32, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r9, %rax
-; AVX512DQ-NEXT:    shlq $33, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r9, %r13
-; AVX512DQ-NEXT:    shlq $34, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    kmovw %k4, %r12d
-; AVX512DQ-NEXT:    kshiftrw $13, %k3, %k4
-; AVX512DQ-NEXT:    shlq $35, %r9
-; AVX512DQ-NEXT:    orq %r13, %r9
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $36, %rax
-; AVX512DQ-NEXT:    orq %r9, %rax
-; AVX512DQ-NEXT:    movq %r12, %r9
-; AVX512DQ-NEXT:    shlq $37, %r9
-; AVX512DQ-NEXT:    orq %rax, %r9
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $38, %rax
-; AVX512DQ-NEXT:    orq %r9, %rax
-; AVX512DQ-NEXT:    movq %r12, %r9
-; AVX512DQ-NEXT:    shlq $39, %r9
-; AVX512DQ-NEXT:    orq %rax, %r9
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $40, %rax
-; AVX512DQ-NEXT:    orq %r9, %rax
-; AVX512DQ-NEXT:    movq %r12, %r9
-; AVX512DQ-NEXT:    shlq $41, %r9
-; AVX512DQ-NEXT:    orq %rax, %r9
-; AVX512DQ-NEXT:    kmovw %k4, %r13d
-; AVX512DQ-NEXT:    kshiftrw $14, %k3, %k4
-; AVX512DQ-NEXT:    shlq $42, %r12
-; AVX512DQ-NEXT:    orq %r9, %r12
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $43, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %r9
-; AVX512DQ-NEXT:    shlq $44, %r9
-; AVX512DQ-NEXT:    orq %rax, %r9
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $45, %rax
-; AVX512DQ-NEXT:    orq %r9, %rax
-; AVX512DQ-NEXT:    movq %r13, %r9
-; AVX512DQ-NEXT:    shlq $46, %r9
-; AVX512DQ-NEXT:    orq %rax, %r9
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $47, %rax
-; AVX512DQ-NEXT:    orq %r9, %rax
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $48, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    kmovw %k4, %r9d
-; AVX512DQ-NEXT:    kshiftrw $4, %k2, %k4
-; AVX512DQ-NEXT:    shlq $49, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    andl $1, %r9d
-; AVX512DQ-NEXT:    movq %r9, %rax
-; AVX512DQ-NEXT:    shlq $50, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r9, %r12
-; AVX512DQ-NEXT:    shlq $51, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r9, %rax
-; AVX512DQ-NEXT:    shlq $52, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r9, %r12
-; AVX512DQ-NEXT:    shlq $53, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r9, %rax
-; AVX512DQ-NEXT:    shlq $54, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r9, %r12
-; AVX512DQ-NEXT:    shlq $55, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    shlq $56, %r9
-; AVX512DQ-NEXT:    orq %r12, %r9
-; AVX512DQ-NEXT:    movzbl %r10b, %eax
-; AVX512DQ-NEXT:    # kill: def $r10d killed $r10d def $r10
-; AVX512DQ-NEXT:    andl $1, %r10d
-; AVX512DQ-NEXT:    movq %r10, %r12
-; AVX512DQ-NEXT:    shlq $57, %r12
-; AVX512DQ-NEXT:    orq %r9, %r12
-; AVX512DQ-NEXT:    movq %r10, %r9
-; AVX512DQ-NEXT:    shlq $58, %r9
-; AVX512DQ-NEXT:    orq %r12, %r9
-; AVX512DQ-NEXT:    movq %r10, %r12
-; AVX512DQ-NEXT:    shlq $59, %r12
-; AVX512DQ-NEXT:    orq %r9, %r12
-; AVX512DQ-NEXT:    movq %r10, %r9
-; AVX512DQ-NEXT:    shlq $60, %r9
-; AVX512DQ-NEXT:    orq %r12, %r9
-; AVX512DQ-NEXT:    movq %r10, %r12
-; AVX512DQ-NEXT:    shlq $61, %r12
-; AVX512DQ-NEXT:    orq %r9, %r12
-; AVX512DQ-NEXT:    kmovw %k4, %r9d
-; AVX512DQ-NEXT:    kshiftrw $6, %k2, %k4
-; AVX512DQ-NEXT:    shlq $62, %r10
-; AVX512DQ-NEXT:    orq %r12, %r10
-; AVX512DQ-NEXT:    shlq $63, %rax
-; AVX512DQ-NEXT:    orq %r10, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movq %rax, 48(%rsi)
-; AVX512DQ-NEXT:    movzbl %r9b, %r10d
-; AVX512DQ-NEXT:    movl %r10d, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    leaq (%rax,%rax,2), %r15
-; AVX512DQ-NEXT:    leaq (%r15,%rax,4), %r15
-; AVX512DQ-NEXT:    movzbl %bpl, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    leaq (%r15,%rax,8), %r15
-; AVX512DQ-NEXT:    movq %rax, %r13
-; AVX512DQ-NEXT:    shlq $4, %r13
-; AVX512DQ-NEXT:    orq %r15, %r13
-; AVX512DQ-NEXT:    movq %rax, %r12
-; AVX512DQ-NEXT:    shlq $5, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    movq %rax, %r15
-; AVX512DQ-NEXT:    shlq $6, %r15
-; AVX512DQ-NEXT:    movq %rax, %r13
-; AVX512DQ-NEXT:    shlq $7, %r13
-; AVX512DQ-NEXT:    orq %r15, %r13
-; AVX512DQ-NEXT:    movq %rax, %r15
-; AVX512DQ-NEXT:    shlq $8, %r15
-; AVX512DQ-NEXT:    orq %r13, %r15
-; AVX512DQ-NEXT:    kmovw %k4, %ebp
-; AVX512DQ-NEXT:    kshiftrw $7, %k2, %k4
-; AVX512DQ-NEXT:    shlq $9, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movzbl %bpl, %r15d
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    movq %r15, %r13
-; AVX512DQ-NEXT:    shlq $10, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $11, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r15, %r13
-; AVX512DQ-NEXT:    shlq $12, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $13, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r15, %r13
-; AVX512DQ-NEXT:    shlq $14, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $15, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    kmovw %k4, %ebp
-; AVX512DQ-NEXT:    kshiftrw $8, %k2, %k4
-; AVX512DQ-NEXT:    shlq $16, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    movzbl %bpl, %r13d
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $17, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movq %r13, %r15
-; AVX512DQ-NEXT:    shlq $18, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $19, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movq %r13, %r15
-; AVX512DQ-NEXT:    shlq $20, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $21, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movq %r13, %r15
-; AVX512DQ-NEXT:    shlq $22, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    kmovw %k4, %eax
-; AVX512DQ-NEXT:    kshiftrw $9, %k2, %k4
-; AVX512DQ-NEXT:    shlq $23, %r13
-; AVX512DQ-NEXT:    orq %r15, %r13
-; AVX512DQ-NEXT:    movzbl %al, %ebp
-; AVX512DQ-NEXT:    andl $1, %ebp
-; AVX512DQ-NEXT:    movq %rbp, %rax
-; AVX512DQ-NEXT:    shlq $24, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %rbp, %r15
-; AVX512DQ-NEXT:    shlq $25, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    movq %rbp, %rax
-; AVX512DQ-NEXT:    shlq $26, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movq %rbp, %r15
-; AVX512DQ-NEXT:    shlq $27, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    movq %rbp, %rax
-; AVX512DQ-NEXT:    shlq $28, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movq %rbp, %r13
-; AVX512DQ-NEXT:    shlq $29, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    kmovw %k4, %r15d
-; AVX512DQ-NEXT:    kshiftrw $10, %k2, %k4
-; AVX512DQ-NEXT:    shlq $30, %rbp
-; AVX512DQ-NEXT:    orq %r13, %rbp
-; AVX512DQ-NEXT:    movzbl %r15b, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    shlq $31, %rax
-; AVX512DQ-NEXT:    orq %rbp, %rax
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    movq %r15, %r13
-; AVX512DQ-NEXT:    shlq $32, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $33, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r15, %r13
-; AVX512DQ-NEXT:    shlq $34, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $35, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r15, %rbp
-; AVX512DQ-NEXT:    shlq $36, %rbp
-; AVX512DQ-NEXT:    orq %rax, %rbp
-; AVX512DQ-NEXT:    kmovw %k4, %r13d
-; AVX512DQ-NEXT:    kshiftrw $11, %k2, %k4
-; AVX512DQ-NEXT:    shlq $37, %r15
-; AVX512DQ-NEXT:    orq %rbp, %r15
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $38, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movq %r13, %r15
-; AVX512DQ-NEXT:    shlq $39, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $40, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movq %r13, %r15
-; AVX512DQ-NEXT:    shlq $41, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $42, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movq %r13, %r15
-; AVX512DQ-NEXT:    shlq $43, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    kmovw %k4, %ebp
-; AVX512DQ-NEXT:    kshiftrw $12, %k2, %k4
-; AVX512DQ-NEXT:    shlq $44, %r13
-; AVX512DQ-NEXT:    orq %r15, %r13
-; AVX512DQ-NEXT:    andl $1, %ebp
-; AVX512DQ-NEXT:    movq %rbp, %rax
-; AVX512DQ-NEXT:    shlq $45, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %rbp, %r15
-; AVX512DQ-NEXT:    shlq $46, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    movq %rbp, %rax
-; AVX512DQ-NEXT:    shlq $47, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movq %rbp, %r15
-; AVX512DQ-NEXT:    shlq $48, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    movq %rbp, %rax
-; AVX512DQ-NEXT:    shlq $49, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movq %rbp, %r13
-; AVX512DQ-NEXT:    shlq $50, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    kmovw %k4, %r15d
-; AVX512DQ-NEXT:    kshiftrw $6, %k0, %k4
-; AVX512DQ-NEXT:    shlq $51, %rbp
-; AVX512DQ-NEXT:    orq %r13, %rbp
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $52, %rax
-; AVX512DQ-NEXT:    orq %rbp, %rax
-; AVX512DQ-NEXT:    movq %r15, %r13
-; AVX512DQ-NEXT:    shlq $53, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $54, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r15, %r13
-; AVX512DQ-NEXT:    shlq $55, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $56, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r15, %r13
-; AVX512DQ-NEXT:    shlq $57, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    shlq $58, %r15
-; AVX512DQ-NEXT:    orq %r13, %r15
-; AVX512DQ-NEXT:    movzbl %bl, %r13d
-; AVX512DQ-NEXT:    movl %ebx, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %rbx
-; AVX512DQ-NEXT:    shlq $59, %rbx
-; AVX512DQ-NEXT:    orq %r15, %rbx
-; AVX512DQ-NEXT:    movq %rax, %r15
-; AVX512DQ-NEXT:    shlq $60, %r15
-; AVX512DQ-NEXT:    orq %rbx, %r15
-; AVX512DQ-NEXT:    movq %rax, %rbx
-; AVX512DQ-NEXT:    shlq $61, %rbx
-; AVX512DQ-NEXT:    orq %r15, %rbx
-; AVX512DQ-NEXT:    kmovw %k4, %r15d
-; AVX512DQ-NEXT:    kshiftrw $14, %k2, %k4
-; AVX512DQ-NEXT:    shlq $62, %rax
-; AVX512DQ-NEXT:    orq %rbx, %rax
-; AVX512DQ-NEXT:    movq %r13, %rbx
-; AVX512DQ-NEXT:    shlq $63, %rbx
-; AVX512DQ-NEXT:    orq %rax, %rbx
-; AVX512DQ-NEXT:    kmovw %k4, %eax
-; AVX512DQ-NEXT:    kshiftrw $15, %k2, %k4
-; AVX512DQ-NEXT:    orq %r12, %rbx
-; AVX512DQ-NEXT:    movq %rbx, 32(%rsi)
-; AVX512DQ-NEXT:    movzbl %al, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    leaq (%r13,%r13,2), %rbx
-; AVX512DQ-NEXT:    leaq (%rbx,%rax,4), %rbx
-; AVX512DQ-NEXT:    leaq (%rbx,%rax,8), %rbx
-; AVX512DQ-NEXT:    movq %rax, %r12
-; AVX512DQ-NEXT:    shlq $4, %r12
-; AVX512DQ-NEXT:    orq %rbx, %r12
-; AVX512DQ-NEXT:    movq %rax, %rbx
-; AVX512DQ-NEXT:    shlq $5, %rbx
-; AVX512DQ-NEXT:    orq %r12, %rbx
-; AVX512DQ-NEXT:    movq %rax, %r12
-; AVX512DQ-NEXT:    shlq $6, %r12
-; AVX512DQ-NEXT:    movq %rax, %r13
-; AVX512DQ-NEXT:    shlq $7, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    kmovw %k4, %ebp
-; AVX512DQ-NEXT:    kshiftrw $1, %k3, %k4
-; AVX512DQ-NEXT:    shlq $8, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movzbl %bpl, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $9, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $10, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $11, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $12, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $13, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $14, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    shlq $15, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    kmovw %k3, %eax
-; AVX512DQ-NEXT:    movzbl %al, %r13d
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $16, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $17, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $18, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $19, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $20, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $21, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    kmovw %k4, %eax
-; AVX512DQ-NEXT:    kshiftrw $2, %k3, %k4
-; AVX512DQ-NEXT:    shlq $22, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    movzbl %al, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $23, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $24, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $25, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $26, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $27, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $28, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    kmovw %k4, %eax
-; AVX512DQ-NEXT:    kshiftrw $3, %k3, %k4
-; AVX512DQ-NEXT:    shlq $29, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    movzbl %al, %r13d
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $30, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $31, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $32, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $33, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $34, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $35, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    kmovw %k4, %ebp
-; AVX512DQ-NEXT:    kshiftrw $4, %k3, %k4
-; AVX512DQ-NEXT:    shlq $36, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    andl $1, %ebp
-; AVX512DQ-NEXT:    movq %rbp, %rax
-; AVX512DQ-NEXT:    shlq $37, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %rbp, %r12
-; AVX512DQ-NEXT:    shlq $38, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %rbp, %rax
-; AVX512DQ-NEXT:    shlq $39, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %rbp, %r12
-; AVX512DQ-NEXT:    shlq $40, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %rbp, %rax
-; AVX512DQ-NEXT:    shlq $41, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %rbp, %r13
-; AVX512DQ-NEXT:    shlq $42, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    kmovw %k4, %r12d
-; AVX512DQ-NEXT:    kshiftrw $5, %k3, %k3
-; AVX512DQ-NEXT:    shlq $43, %rbp
-; AVX512DQ-NEXT:    orq %r13, %rbp
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $44, %rax
-; AVX512DQ-NEXT:    orq %rbp, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $45, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $46, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $47, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $48, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $49, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    kmovw %k3, %eax
-; AVX512DQ-NEXT:    kshiftrw $3, %k0, %k3
-; AVX512DQ-NEXT:    shlq $50, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    movzbl %al, %r13d
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $51, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $52, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $53, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $54, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $55, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $56, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    kmovw %k3, %ebp
-; AVX512DQ-NEXT:    kshiftrw $2, %k0, %k3
-; AVX512DQ-NEXT:    shlq $57, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    andl $1, %r11d
-; AVX512DQ-NEXT:    movq %r11, %rax
-; AVX512DQ-NEXT:    shlq $58, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r11, %r12
-; AVX512DQ-NEXT:    shlq $59, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r11, %rax
-; AVX512DQ-NEXT:    shlq $60, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r11, %r12
-; AVX512DQ-NEXT:    shlq $61, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    kmovw %k3, %eax
-; AVX512DQ-NEXT:    kshiftrw $4, %k0, %k3
-; AVX512DQ-NEXT:    shlq $62, %r11
-; AVX512DQ-NEXT:    orq %r12, %r11
-; AVX512DQ-NEXT:    shlq $63, %r14
-; AVX512DQ-NEXT:    orq %r11, %r14
-; AVX512DQ-NEXT:    orq %rbx, %r14
-; AVX512DQ-NEXT:    movq %r14, 40(%rsi)
-; AVX512DQ-NEXT:    movzbl %al, %r11d
-; AVX512DQ-NEXT:    movl %r11d, %ebx
-; AVX512DQ-NEXT:    andl $1, %ebx
-; AVX512DQ-NEXT:    leaq (%rbx,%rbx,2), %rax
-; AVX512DQ-NEXT:    leaq (%rax,%rbx,4), %rax
-; AVX512DQ-NEXT:    leaq (%rax,%rbx,8), %rax
-; AVX512DQ-NEXT:    movq %rbx, %r12
-; AVX512DQ-NEXT:    shlq $4, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movzbl %bpl, %r13d
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %r14
-; AVX512DQ-NEXT:    shlq $5, %r14
-; AVX512DQ-NEXT:    orq %r12, %r14
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $6, %rax
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $7, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $8, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $9, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $10, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    kmovw %k3, %ebp
-; AVX512DQ-NEXT:    kshiftrw $5, %k0, %k3
-; AVX512DQ-NEXT:    shlq $11, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movzbl %bpl, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $12, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $13, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $14, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $15, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $16, %rax
-; AVX512DQ-NEXT:    orq %r13, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $17, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    kmovw %k3, %eax
-; AVX512DQ-NEXT:    kshiftrw $11, %k0, %k3
-; AVX512DQ-NEXT:    shlq $18, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    movzbl %al, %r13d
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $19, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $20, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $21, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $22, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $23, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r13, %r12
-; AVX512DQ-NEXT:    shlq $24, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    shlq $25, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    movzbl %r15b, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %r12
-; AVX512DQ-NEXT:    shlq $26, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    movq %rax, %r13
-; AVX512DQ-NEXT:    shlq $27, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    movq %rax, %r12
-; AVX512DQ-NEXT:    shlq $28, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    movq %rax, %r13
-; AVX512DQ-NEXT:    shlq $29, %r13
-; AVX512DQ-NEXT:    orq %r12, %r13
-; AVX512DQ-NEXT:    movq %rax, %r12
-; AVX512DQ-NEXT:    shlq $30, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    kmovw %k3, %ebp
-; AVX512DQ-NEXT:    kshiftrw $7, %k0, %k3
-; AVX512DQ-NEXT:    shlq $31, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    kmovw %k3, %r12d
-; AVX512DQ-NEXT:    kshiftrw $8, %k0, %k3
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    shlq $32, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $33, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movq %r12, %r15
-; AVX512DQ-NEXT:    shlq $34, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $35, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movq %r12, %r15
-; AVX512DQ-NEXT:    shlq $36, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $37, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $38, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    kmovw %k3, %r15d
-; AVX512DQ-NEXT:    kshiftrw $9, %k0, %k3
-; AVX512DQ-NEXT:    shlq $39, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $40, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $41, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $42, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $43, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $44, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r15, %r13
-; AVX512DQ-NEXT:    shlq $45, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    kmovw %k3, %r12d
-; AVX512DQ-NEXT:    kshiftrw $10, %k0, %k3
-; AVX512DQ-NEXT:    shlq $46, %r15
-; AVX512DQ-NEXT:    orq %r13, %r15
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $47, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movq %r12, %r15
-; AVX512DQ-NEXT:    shlq $48, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $49, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movq %r12, %r15
-; AVX512DQ-NEXT:    shlq $50, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $51, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $52, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    kmovw %k3, %r15d
-; AVX512DQ-NEXT:    kshiftrw $10, %k1, %k3
-; AVX512DQ-NEXT:    shlq $53, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $54, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $55, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $56, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $57, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $58, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $59, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    shlq $60, %r15
-; AVX512DQ-NEXT:    orq %r12, %r15
-; AVX512DQ-NEXT:    movzbl %bpl, %eax
-; AVX512DQ-NEXT:    movl %ebp, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $61, %r13
-; AVX512DQ-NEXT:    orq %r15, %r13
-; AVX512DQ-NEXT:    kmovw %k3, %ebp
-; AVX512DQ-NEXT:    kshiftrw $12, %k0, %k3
-; AVX512DQ-NEXT:    shlq $62, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    movq %rax, %r15
-; AVX512DQ-NEXT:    shlq $63, %r15
-; AVX512DQ-NEXT:    orq %r12, %r15
-; AVX512DQ-NEXT:    kmovw %k3, %r12d
-; AVX512DQ-NEXT:    kshiftrw $13, %k0, %k3
-; AVX512DQ-NEXT:    orq %r14, %r15
-; AVX512DQ-NEXT:    movq %r15, 16(%rsi)
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    leaq (%rax,%rax,2), %r14
-; AVX512DQ-NEXT:    leaq (%r14,%rax,4), %r14
-; AVX512DQ-NEXT:    leaq (%r14,%rax,8), %rax
-; AVX512DQ-NEXT:    movzbl %r12b, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %r15
-; AVX512DQ-NEXT:    shlq $4, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    movq %r12, %r14
-; AVX512DQ-NEXT:    shlq $5, %r14
-; AVX512DQ-NEXT:    orq %r15, %r14
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $6, %rax
-; AVX512DQ-NEXT:    movq %r12, %r15
-; AVX512DQ-NEXT:    shlq $7, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $8, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movq %r12, %r15
-; AVX512DQ-NEXT:    shlq $9, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    kmovw %k3, %eax
-; AVX512DQ-NEXT:    kshiftrw $14, %k0, %k3
-; AVX512DQ-NEXT:    shlq $10, %r12
-; AVX512DQ-NEXT:    orq %r15, %r12
-; AVX512DQ-NEXT:    movzbl %al, %r15d
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $11, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $12, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $13, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $14, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $15, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $16, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    kmovw %k3, %eax
-; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k3
-; AVX512DQ-NEXT:    shlq $17, %r15
-; AVX512DQ-NEXT:    orq %r12, %r15
-; AVX512DQ-NEXT:    movzbl %al, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $18, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movq %r12, %r15
-; AVX512DQ-NEXT:    shlq $19, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $20, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movq %r12, %r15
-; AVX512DQ-NEXT:    shlq $21, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $22, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movq %r12, %r15
-; AVX512DQ-NEXT:    shlq $23, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    kmovw %k3, %eax
-; AVX512DQ-NEXT:    kshiftrw $1, %k2, %k3
-; AVX512DQ-NEXT:    shlq $24, %r12
-; AVX512DQ-NEXT:    orq %r15, %r12
-; AVX512DQ-NEXT:    movzbl %al, %r15d
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $25, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $26, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $27, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $28, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $29, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $30, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    shlq $31, %r15
-; AVX512DQ-NEXT:    orq %r12, %r15
-; AVX512DQ-NEXT:    kmovw %k2, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $32, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movq %r12, %r15
-; AVX512DQ-NEXT:    shlq $33, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $34, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movq %r12, %r15
-; AVX512DQ-NEXT:    shlq $35, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $36, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $37, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    kmovw %k3, %r15d
-; AVX512DQ-NEXT:    kshiftrw $2, %k2, %k3
-; AVX512DQ-NEXT:    shlq $38, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $39, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $40, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $41, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $42, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $43, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $44, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    kmovw %k3, %eax
-; AVX512DQ-NEXT:    kshiftrw $3, %k2, %k2
-; AVX512DQ-NEXT:    shlq $45, %r15
-; AVX512DQ-NEXT:    orq %r12, %r15
-; AVX512DQ-NEXT:    movzbl %al, %r12d
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $46, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movq %r12, %r15
-; AVX512DQ-NEXT:    shlq $47, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $48, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movq %r12, %r15
-; AVX512DQ-NEXT:    shlq $49, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $50, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movq %r12, %r13
-; AVX512DQ-NEXT:    shlq $51, %r13
-; AVX512DQ-NEXT:    orq %rax, %r13
-; AVX512DQ-NEXT:    kmovw %k2, %r15d
-; AVX512DQ-NEXT:    kshiftrw $14, %k1, %k1
-; AVX512DQ-NEXT:    shlq $52, %r12
-; AVX512DQ-NEXT:    orq %r13, %r12
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $53, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $54, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $55, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $56, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $57, %rax
-; AVX512DQ-NEXT:    orq %r12, %rax
-; AVX512DQ-NEXT:    movq %r15, %r12
-; AVX512DQ-NEXT:    shlq $58, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    kmovw %k1, %r13d
-; AVX512DQ-NEXT:    shlq $59, %r15
-; AVX512DQ-NEXT:    orq %r12, %r15
-; AVX512DQ-NEXT:    andl $1, %r9d
-; AVX512DQ-NEXT:    movq %r9, %rax
-; AVX512DQ-NEXT:    shlq $60, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movq %r9, %r12
-; AVX512DQ-NEXT:    shlq $61, %r12
-; AVX512DQ-NEXT:    orq %rax, %r12
-; AVX512DQ-NEXT:    kmovw %k0, %r15d
-; AVX512DQ-NEXT:    kshiftrw $1, %k0, %k0
-; AVX512DQ-NEXT:    shlq $62, %r9
-; AVX512DQ-NEXT:    orq %r12, %r9
-; AVX512DQ-NEXT:    kmovw %k0, %r12d
-; AVX512DQ-NEXT:    shlq $63, %r10
-; AVX512DQ-NEXT:    orq %r9, %r10
-; AVX512DQ-NEXT:    orq %r14, %r10
-; AVX512DQ-NEXT:    movq %r10, 24(%rsi)
-; AVX512DQ-NEXT:    andl $1, %r8d
-; AVX512DQ-NEXT:    leaq (%r8,%r8,2), %rax
-; AVX512DQ-NEXT:    leaq (%rax,%r8,4), %rax
-; AVX512DQ-NEXT:    leaq (%rax,%r8,8), %rax
-; AVX512DQ-NEXT:    movq %r8, %r9
-; AVX512DQ-NEXT:    shlq $4, %r9
-; AVX512DQ-NEXT:    orq %rax, %r9
-; AVX512DQ-NEXT:    shlq $5, %r8
-; AVX512DQ-NEXT:    orq %r9, %r8
-; AVX512DQ-NEXT:    movzbl %bpl, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %r9
-; AVX512DQ-NEXT:    shlq $6, %r9
-; AVX512DQ-NEXT:    movq %rax, %r10
-; AVX512DQ-NEXT:    shlq $7, %r10
-; AVX512DQ-NEXT:    orq %r9, %r10
-; AVX512DQ-NEXT:    movq %rax, %r9
-; AVX512DQ-NEXT:    shlq $8, %r9
-; AVX512DQ-NEXT:    orq %r10, %r9
-; AVX512DQ-NEXT:    movq %rax, %r10
-; AVX512DQ-NEXT:    shlq $9, %r10
-; AVX512DQ-NEXT:    orq %r9, %r10
-; AVX512DQ-NEXT:    movq %rax, %r9
-; AVX512DQ-NEXT:    shlq $10, %r9
-; AVX512DQ-NEXT:    orq %r10, %r9
-; AVX512DQ-NEXT:    movq %rax, %r10
-; AVX512DQ-NEXT:    shlq $11, %r10
-; AVX512DQ-NEXT:    orq %r9, %r10
-; AVX512DQ-NEXT:    shlq $12, %rax
-; AVX512DQ-NEXT:    orq %r10, %rax
-; AVX512DQ-NEXT:    movzbl %dil, %edi
-; AVX512DQ-NEXT:    andl $1, %edi
-; AVX512DQ-NEXT:    movq %rdi, %r9
-; AVX512DQ-NEXT:    shlq $13, %r9
-; AVX512DQ-NEXT:    orq %rax, %r9
-; AVX512DQ-NEXT:    movq %rdi, %rax
-; AVX512DQ-NEXT:    shlq $14, %rax
-; AVX512DQ-NEXT:    orq %r9, %rax
-; AVX512DQ-NEXT:    movq %rdi, %r9
-; AVX512DQ-NEXT:    shlq $15, %r9
-; AVX512DQ-NEXT:    orq %rax, %r9
-; AVX512DQ-NEXT:    movq %rdi, %rax
-; AVX512DQ-NEXT:    shlq $16, %rax
-; AVX512DQ-NEXT:    orq %r9, %rax
-; AVX512DQ-NEXT:    movq %rdi, %r9
-; AVX512DQ-NEXT:    shlq $17, %r9
-; AVX512DQ-NEXT:    orq %rax, %r9
-; AVX512DQ-NEXT:    movq %rdi, %rax
-; AVX512DQ-NEXT:    shlq $18, %rax
-; AVX512DQ-NEXT:    orq %r9, %rax
-; AVX512DQ-NEXT:    shlq $19, %rdi
-; AVX512DQ-NEXT:    orq %rax, %rdi
-; AVX512DQ-NEXT:    movzbl %dl, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %rdx
-; AVX512DQ-NEXT:    shlq $20, %rdx
-; AVX512DQ-NEXT:    orq %rdi, %rdx
-; AVX512DQ-NEXT:    movq %rax, %rdi
-; AVX512DQ-NEXT:    shlq $21, %rdi
-; AVX512DQ-NEXT:    orq %rdx, %rdi
-; AVX512DQ-NEXT:    movq %rax, %rdx
-; AVX512DQ-NEXT:    shlq $22, %rdx
-; AVX512DQ-NEXT:    orq %rdi, %rdx
-; AVX512DQ-NEXT:    movq %rax, %rdi
-; AVX512DQ-NEXT:    shlq $23, %rdi
-; AVX512DQ-NEXT:    orq %rdx, %rdi
-; AVX512DQ-NEXT:    movq %rax, %rdx
-; AVX512DQ-NEXT:    shlq $24, %rdx
-; AVX512DQ-NEXT:    orq %rdi, %rdx
-; AVX512DQ-NEXT:    movq %rax, %rdi
-; AVX512DQ-NEXT:    shlq $25, %rdi
-; AVX512DQ-NEXT:    orq %rdx, %rdi
-; AVX512DQ-NEXT:    shlq $26, %rax
-; AVX512DQ-NEXT:    orq %rdi, %rax
-; AVX512DQ-NEXT:    movzbl %cl, %ecx
-; AVX512DQ-NEXT:    andl $1, %ecx
-; AVX512DQ-NEXT:    movq %rcx, %rdx
-; AVX512DQ-NEXT:    shlq $27, %rdx
-; AVX512DQ-NEXT:    orq %rax, %rdx
-; AVX512DQ-NEXT:    movq %rcx, %rax
-; AVX512DQ-NEXT:    shlq $28, %rax
-; AVX512DQ-NEXT:    orq %rdx, %rax
-; AVX512DQ-NEXT:    movq %rcx, %rdx
-; AVX512DQ-NEXT:    shlq $29, %rdx
-; AVX512DQ-NEXT:    orq %rax, %rdx
-; AVX512DQ-NEXT:    movq %rcx, %rax
-; AVX512DQ-NEXT:    shlq $30, %rax
-; AVX512DQ-NEXT:    orq %rdx, %rax
-; AVX512DQ-NEXT:    movq %rcx, %rdx
-; AVX512DQ-NEXT:    shlq $31, %rdx
-; AVX512DQ-NEXT:    orq %rax, %rdx
-; AVX512DQ-NEXT:    movq %rcx, %rax
-; AVX512DQ-NEXT:    shlq $32, %rax
-; AVX512DQ-NEXT:    orq %rdx, %rax
-; AVX512DQ-NEXT:    shlq $33, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    andl $1, %r13d
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $34, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    movq %r13, %rcx
-; AVX512DQ-NEXT:    shlq $35, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $36, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    movq %r13, %rcx
-; AVX512DQ-NEXT:    shlq $37, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    movq %r13, %rax
-; AVX512DQ-NEXT:    shlq $38, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    movq %r13, %rcx
-; AVX512DQ-NEXT:    shlq $39, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    shlq $40, %r13
-; AVX512DQ-NEXT:    orq %rcx, %r13
-; AVX512DQ-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    movq %rax, %rcx
-; AVX512DQ-NEXT:    shlq $41, %rcx
-; AVX512DQ-NEXT:    orq %r13, %rcx
-; AVX512DQ-NEXT:    movq %rax, %rdx
-; AVX512DQ-NEXT:    shlq $42, %rdx
-; AVX512DQ-NEXT:    orq %rcx, %rdx
-; AVX512DQ-NEXT:    movq %rax, %rcx
-; AVX512DQ-NEXT:    shlq $43, %rcx
-; AVX512DQ-NEXT:    orq %rdx, %rcx
-; AVX512DQ-NEXT:    movq %rax, %rdx
-; AVX512DQ-NEXT:    shlq $44, %rdx
-; AVX512DQ-NEXT:    orq %rcx, %rdx
-; AVX512DQ-NEXT:    movq %rax, %rcx
-; AVX512DQ-NEXT:    shlq $45, %rcx
-; AVX512DQ-NEXT:    orq %rdx, %rcx
-; AVX512DQ-NEXT:    movq %rax, %rdx
-; AVX512DQ-NEXT:    shlq $46, %rdx
-; AVX512DQ-NEXT:    orq %rcx, %rdx
-; AVX512DQ-NEXT:    shlq $47, %rax
-; AVX512DQ-NEXT:    orq %rdx, %rax
-; AVX512DQ-NEXT:    andl $1, %r15d
-; AVX512DQ-NEXT:    movq %r15, %rcx
-; AVX512DQ-NEXT:    shlq $48, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $49, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    movq %r15, %rcx
-; AVX512DQ-NEXT:    shlq $50, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $51, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    movq %r15, %rcx
-; AVX512DQ-NEXT:    shlq $52, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    movq %r15, %rax
-; AVX512DQ-NEXT:    shlq $53, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    shlq $54, %r15
-; AVX512DQ-NEXT:    orq %rax, %r15
-; AVX512DQ-NEXT:    andl $1, %r12d
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $55, %rax
-; AVX512DQ-NEXT:    orq %r15, %rax
-; AVX512DQ-NEXT:    movq %r12, %rcx
-; AVX512DQ-NEXT:    shlq $56, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $57, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    movq %r12, %rcx
-; AVX512DQ-NEXT:    shlq $58, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    movq %r12, %rax
-; AVX512DQ-NEXT:    shlq $59, %rax
-; AVX512DQ-NEXT:    orq %rcx, %rax
-; AVX512DQ-NEXT:    movq %r12, %rcx
-; AVX512DQ-NEXT:    shlq $60, %rcx
-; AVX512DQ-NEXT:    orq %rax, %rcx
-; AVX512DQ-NEXT:    shlq $61, %r12
-; AVX512DQ-NEXT:    orq %rcx, %r12
-; AVX512DQ-NEXT:    shlq $62, %rbx
-; AVX512DQ-NEXT:    orq %r12, %rbx
-; AVX512DQ-NEXT:    shlq $63, %r11
-; AVX512DQ-NEXT:    orq %rbx, %r11
-; AVX512DQ-NEXT:    orq %r8, %r11
-; AVX512DQ-NEXT:    movq %r11, 8(%rsi)
-; AVX512DQ-NEXT:    popq %rbx
-; AVX512DQ-NEXT:    popq %r12
-; AVX512DQ-NEXT:    popq %r13
-; AVX512DQ-NEXT:    popq %r14
-; AVX512DQ-NEXT:    popq %r15
-; AVX512DQ-NEXT:    popq %rbp
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm5
+; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k1
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15]
+; AVX512DQ-NEXT:    vpermd %zmm3, %zmm2, %zmm6
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13]
+; AVX512DQ-NEXT:    vpermd %zmm3, %zmm7, %zmm8
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11]
+; AVX512DQ-NEXT:    vpermd %zmm3, %zmm9, %zmm10
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9]
+; AVX512DQ-NEXT:    vpermd %zmm3, %zmm11, %zmm12
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm13 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6]
+; AVX512DQ-NEXT:    vpermd %zmm3, %zmm13, %zmm14
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm15 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4]
+; AVX512DQ-NEXT:    vpermd %zmm3, %zmm15, %zmm16
+; AVX512DQ-NEXT:    vpermd %zmm3, %zmm1, %zmm3
+; AVX512DQ-NEXT:    vpermd %zmm4, %zmm2, %zmm17
+; AVX512DQ-NEXT:    vpermd %zmm4, %zmm7, %zmm18
+; AVX512DQ-NEXT:    vpermd %zmm4, %zmm9, %zmm19
+; AVX512DQ-NEXT:    vpermd %zmm4, %zmm11, %zmm20
+; AVX512DQ-NEXT:    vpermd %zmm4, %zmm13, %zmm21
+; AVX512DQ-NEXT:    vpermd %zmm4, %zmm15, %zmm22
+; AVX512DQ-NEXT:    vpermd %zmm4, %zmm1, %zmm4
+; AVX512DQ-NEXT:    vpermd %zmm5, %zmm2, %zmm23
+; AVX512DQ-NEXT:    vpermd %zmm5, %zmm7, %zmm24
+; AVX512DQ-NEXT:    vpermd %zmm5, %zmm9, %zmm25
+; AVX512DQ-NEXT:    vpermd %zmm5, %zmm11, %zmm26
+; AVX512DQ-NEXT:    vpermd %zmm5, %zmm13, %zmm27
+; AVX512DQ-NEXT:    vpermd %zmm5, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpermd %zmm5, %zmm15, %zmm5
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm2, %zmm2
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm7, %zmm7
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm9, %zmm9
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm11, %zmm11
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm13, %zmm13
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm15, %zmm0
+; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm15 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k1
+; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm13, %k1
+; AVX512DQ-NEXT:    vmovdqa32 128(%rsi), %zmm13 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm11, %k1
+; AVX512DQ-NEXT:    vmovdqa32 192(%rsi), %zmm11 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm9, %k1
+; AVX512DQ-NEXT:    vmovdqa32 256(%rsi), %zmm9 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm7, %k1
+; AVX512DQ-NEXT:    vmovdqa32 320(%rsi), %zmm7 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k1
+; AVX512DQ-NEXT:    vmovdqa32 384(%rsi), %zmm2 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k1
+; AVX512DQ-NEXT:    vmovdqa32 448(%rsi), %zmm1 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm5, %k1
+; AVX512DQ-NEXT:    vmovdqa32 512(%rsi), %zmm5 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm27, %k1
+; AVX512DQ-NEXT:    vmovdqa32 576(%rsi), %zmm27 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm26, %k1
+; AVX512DQ-NEXT:    vmovdqa32 640(%rsi), %zmm26 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm25, %k1
+; AVX512DQ-NEXT:    vmovdqa32 704(%rsi), %zmm25 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm24, %k1
+; AVX512DQ-NEXT:    vmovdqa32 768(%rsi), %zmm24 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm23, %k1
+; AVX512DQ-NEXT:    vmovdqa32 832(%rsi), %zmm23 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm4, %k1
+; AVX512DQ-NEXT:    vmovdqa32 896(%rsi), %zmm4 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm22, %k1
+; AVX512DQ-NEXT:    vmovdqa32 960(%rsi), %zmm22 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm21, %k1
+; AVX512DQ-NEXT:    vmovdqa32 1024(%rsi), %zmm21 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm20, %k1
+; AVX512DQ-NEXT:    vmovdqa32 1088(%rsi), %zmm20 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm19, %k1
+; AVX512DQ-NEXT:    vmovdqa32 1152(%rsi), %zmm19 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm18, %k1
+; AVX512DQ-NEXT:    vmovdqa32 1216(%rsi), %zmm18 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm17, %k1
+; AVX512DQ-NEXT:    vmovdqa32 1280(%rsi), %zmm17 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm3, %k1
+; AVX512DQ-NEXT:    vmovdqa32 1344(%rsi), %zmm3 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm16, %k1
+; AVX512DQ-NEXT:    vmovdqa32 1408(%rsi), %zmm16 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm14, %k1
+; AVX512DQ-NEXT:    vmovdqa32 1472(%rsi), %zmm14 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm12, %k1
+; AVX512DQ-NEXT:    vmovdqa32 1536(%rsi), %zmm12 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm10, %k1
+; AVX512DQ-NEXT:    vmovdqa32 1600(%rsi), %zmm10 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm8, %k1
+; AVX512DQ-NEXT:    vmovdqa32 1664(%rsi), %zmm8 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm6, %k1
+; AVX512DQ-NEXT:    vmovdqa32 1728(%rsi), %zmm6 {%k1} {z}
+; AVX512DQ-NEXT:    vmovdqa64 %zmm6, 1728(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm8, 1664(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm10, 1600(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm12, 1536(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm14, 1472(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm16, 1408(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 1344(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm17, 1280(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm18, 1216(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm19, 1152(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm20, 1088(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm21, 1024(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm22, 960(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 896(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm23, 832(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm24, 768(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm25, 704(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm26, 640(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm27, 576(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm5, 512(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 448(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 384(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm7, 320(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm9, 256(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm11, 192(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm13, 128(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm15, (%rdx)
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512BW-LABEL: mask_replication_factor7_vf64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    pushq %rbp
-; AVX512BW-NEXT:    pushq %r15
-; AVX512BW-NEXT:    pushq %r14
-; AVX512BW-NEXT:    pushq %r13
-; AVX512BW-NEXT:    pushq %r12
-; AVX512BW-NEXT:    pushq %rbx
-; AVX512BW-NEXT:    kmovq (%rdi), %k0
-; AVX512BW-NEXT:    kshiftrq $13, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512BW-NEXT:    kshiftrq $12, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %ecx
-; AVX512BW-NEXT:    kshiftrq $11, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %edx
-; AVX512BW-NEXT:    kshiftrq $10, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %edi
-; AVX512BW-NEXT:    kshiftrq $45, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %ebx
-; AVX512BW-NEXT:    kshiftrq $37, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %ebp
-; AVX512BW-NEXT:    kshiftrq $59, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r9d
-; AVX512BW-NEXT:    kshiftrq $56, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r10d
-; AVX512BW-NEXT:    kshiftrq $54, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r11d
-; AVX512BW-NEXT:    kshiftrq $4, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    kshiftrq $1, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k1, %r15d
-; AVX512BW-NEXT:    kshiftrq $2, %k0, %k1
-; AVX512BW-NEXT:    kmovd %k0, %r14d
-; AVX512BW-NEXT:    movzbl %r14b, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    leaq (%r12,%r12,2), %r14
-; AVX512BW-NEXT:    leaq (%r14,%r12,4), %r14
-; AVX512BW-NEXT:    leaq (%r14,%r12,8), %r14
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $4, %r13
-; AVX512BW-NEXT:    orq %r14, %r13
-; AVX512BW-NEXT:    movq %r12, %r14
-; AVX512BW-NEXT:    shlq $5, %r14
-; AVX512BW-NEXT:    orq %r13, %r14
-; AVX512BW-NEXT:    shlq $6, %r12
-; AVX512BW-NEXT:    movzbl %r15b, %r15d
-; AVX512BW-NEXT:    andl $1, %r15d
-; AVX512BW-NEXT:    movq %r15, %r13
-; AVX512BW-NEXT:    shlq $7, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    movq %r15, %r12
-; AVX512BW-NEXT:    shlq $8, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    movq %r15, %r13
-; AVX512BW-NEXT:    shlq $9, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    movq %r15, %r12
-; AVX512BW-NEXT:    shlq $10, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    movq %r15, %r13
-; AVX512BW-NEXT:    shlq $11, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    movq %r15, %r12
-; AVX512BW-NEXT:    shlq $12, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    kmovd %k1, %r13d
-; AVX512BW-NEXT:    kshiftrq $3, %k0, %k1
-; AVX512BW-NEXT:    shlq $13, %r15
-; AVX512BW-NEXT:    orq %r12, %r15
-; AVX512BW-NEXT:    movzbl %r13b, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $14, %r13
-; AVX512BW-NEXT:    orq %r15, %r13
-; AVX512BW-NEXT:    movq %r12, %r15
-; AVX512BW-NEXT:    shlq $15, %r15
-; AVX512BW-NEXT:    orq %r13, %r15
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $16, %r13
-; AVX512BW-NEXT:    orq %r15, %r13
-; AVX512BW-NEXT:    movq %r12, %r15
-; AVX512BW-NEXT:    shlq $17, %r15
-; AVX512BW-NEXT:    orq %r13, %r15
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $18, %r13
-; AVX512BW-NEXT:    orq %r15, %r13
-; AVX512BW-NEXT:    movq %r12, %r15
-; AVX512BW-NEXT:    shlq $19, %r15
-; AVX512BW-NEXT:    orq %r13, %r15
-; AVX512BW-NEXT:    kmovd %k1, %r13d
-; AVX512BW-NEXT:    kshiftrq $55, %k0, %k1
-; AVX512BW-NEXT:    shlq $20, %r12
-; AVX512BW-NEXT:    orq %r15, %r12
-; AVX512BW-NEXT:    movzbl %r13b, %r15d
-; AVX512BW-NEXT:    andl $1, %r15d
-; AVX512BW-NEXT:    movq %r15, %r13
-; AVX512BW-NEXT:    shlq $21, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    movq %r15, %r12
-; AVX512BW-NEXT:    shlq $22, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    movq %r15, %r13
-; AVX512BW-NEXT:    shlq $23, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    movq %r15, %r12
-; AVX512BW-NEXT:    shlq $24, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    movq %r15, %r13
-; AVX512BW-NEXT:    shlq $25, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    movq %r15, %r12
-; AVX512BW-NEXT:    shlq $26, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    shlq $27, %r15
-; AVX512BW-NEXT:    orq %r12, %r15
-; AVX512BW-NEXT:    movzbl %al, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $28, %r13
-; AVX512BW-NEXT:    orq %r15, %r13
-; AVX512BW-NEXT:    movq %r12, %r15
-; AVX512BW-NEXT:    shlq $29, %r15
-; AVX512BW-NEXT:    orq %r13, %r15
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $30, %r13
-; AVX512BW-NEXT:    orq %r15, %r13
-; AVX512BW-NEXT:    kmovd %k1, %r15d
-; AVX512BW-NEXT:    kshiftrq $5, %k0, %k1
-; AVX512BW-NEXT:    shlq $31, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %r13
-; AVX512BW-NEXT:    shlq $32, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    movq %rax, %r8
-; AVX512BW-NEXT:    shlq $33, %r8
-; AVX512BW-NEXT:    orq %r13, %r8
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrq $6, %k0, %k1
-; AVX512BW-NEXT:    shlq $34, %rax
-; AVX512BW-NEXT:    orq %r8, %rax
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %r8
-; AVX512BW-NEXT:    shlq $35, %r8
-; AVX512BW-NEXT:    orq %rax, %r8
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $36, %rax
-; AVX512BW-NEXT:    orq %r8, %rax
-; AVX512BW-NEXT:    movq %r12, %r8
-; AVX512BW-NEXT:    shlq $37, %r8
-; AVX512BW-NEXT:    orq %rax, %r8
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $38, %rax
-; AVX512BW-NEXT:    orq %r8, %rax
-; AVX512BW-NEXT:    movq %r12, %r8
-; AVX512BW-NEXT:    shlq $39, %r8
-; AVX512BW-NEXT:    orq %rax, %r8
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $40, %rax
-; AVX512BW-NEXT:    orq %r8, %rax
-; AVX512BW-NEXT:    kmovd %k1, %r8d
-; AVX512BW-NEXT:    kshiftrq $7, %k0, %k1
-; AVX512BW-NEXT:    shlq $41, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    andl $1, %r8d
-; AVX512BW-NEXT:    movq %r8, %rax
-; AVX512BW-NEXT:    shlq $42, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r8, %r12
-; AVX512BW-NEXT:    shlq $43, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r8, %rax
-; AVX512BW-NEXT:    shlq $44, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r8, %r12
-; AVX512BW-NEXT:    shlq $45, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r8, %rax
-; AVX512BW-NEXT:    shlq $46, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r8, %r13
-; AVX512BW-NEXT:    shlq $47, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrq $8, %k0, %k1
-; AVX512BW-NEXT:    shlq $48, %r8
-; AVX512BW-NEXT:    orq %r13, %r8
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $49, %rax
-; AVX512BW-NEXT:    orq %r8, %rax
-; AVX512BW-NEXT:    movq %r12, %r8
-; AVX512BW-NEXT:    shlq $50, %r8
-; AVX512BW-NEXT:    orq %rax, %r8
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $51, %rax
-; AVX512BW-NEXT:    orq %r8, %rax
-; AVX512BW-NEXT:    movq %r12, %r8
-; AVX512BW-NEXT:    shlq $52, %r8
-; AVX512BW-NEXT:    orq %rax, %r8
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $53, %rax
-; AVX512BW-NEXT:    orq %r8, %rax
-; AVX512BW-NEXT:    movq %r12, %r8
-; AVX512BW-NEXT:    shlq $54, %r8
-; AVX512BW-NEXT:    orq %rax, %r8
-; AVX512BW-NEXT:    kmovd %k1, %r13d
-; AVX512BW-NEXT:    kshiftrq $9, %k0, %k1
-; AVX512BW-NEXT:    shlq $55, %r12
-; AVX512BW-NEXT:    orq %r8, %r12
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $56, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %r8
-; AVX512BW-NEXT:    shlq $57, %r8
-; AVX512BW-NEXT:    orq %rax, %r8
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $58, %rax
-; AVX512BW-NEXT:    orq %r8, %rax
-; AVX512BW-NEXT:    movq %r13, %r8
-; AVX512BW-NEXT:    shlq $59, %r8
-; AVX512BW-NEXT:    orq %rax, %r8
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $60, %rax
-; AVX512BW-NEXT:    orq %r8, %rax
-; AVX512BW-NEXT:    movq %r13, %r8
-; AVX512BW-NEXT:    shlq $61, %r8
-; AVX512BW-NEXT:    orq %rax, %r8
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    kshiftrq $57, %k0, %k1
-; AVX512BW-NEXT:    shlq $62, %r13
-; AVX512BW-NEXT:    orq %r8, %r13
-; AVX512BW-NEXT:    movzbl %al, %r8d
-; AVX512BW-NEXT:    movq %r8, %rax
-; AVX512BW-NEXT:    shlq $63, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    orq %r14, %rax
-; AVX512BW-NEXT:    movq %rax, (%rsi)
-; AVX512BW-NEXT:    movzbl %r15b, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movzbl %r11b, %r14d
-; AVX512BW-NEXT:    movl %r14d, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    leaq (%rax,%r12,2), %rax
-; AVX512BW-NEXT:    leaq (%rax,%r12,4), %rax
-; AVX512BW-NEXT:    leaq (%rax,%r12,8), %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $4, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r12, %r15
-; AVX512BW-NEXT:    shlq $5, %r15
-; AVX512BW-NEXT:    orq %r13, %r15
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $6, %rax
-; AVX512BW-NEXT:    shlq $7, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movzbl %r10b, %r10d
-; AVX512BW-NEXT:    andl $1, %r10d
-; AVX512BW-NEXT:    movq %r10, %rax
-; AVX512BW-NEXT:    shlq $8, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r10, %r12
-; AVX512BW-NEXT:    shlq $9, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r10, %rax
-; AVX512BW-NEXT:    shlq $10, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r10, %r12
-; AVX512BW-NEXT:    shlq $11, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r10, %rax
-; AVX512BW-NEXT:    shlq $12, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r10, %r12
-; AVX512BW-NEXT:    shlq $13, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    kshiftrq $58, %k0, %k1
-; AVX512BW-NEXT:    shlq $14, %r10
-; AVX512BW-NEXT:    orq %r12, %r10
-; AVX512BW-NEXT:    movzbl %al, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $15, %rax
-; AVX512BW-NEXT:    orq %r10, %rax
-; AVX512BW-NEXT:    movq %r12, %r10
-; AVX512BW-NEXT:    shlq $16, %r10
-; AVX512BW-NEXT:    orq %rax, %r10
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $17, %rax
-; AVX512BW-NEXT:    orq %r10, %rax
-; AVX512BW-NEXT:    movq %r12, %r10
-; AVX512BW-NEXT:    shlq $18, %r10
-; AVX512BW-NEXT:    orq %rax, %r10
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $19, %rax
-; AVX512BW-NEXT:    orq %r10, %rax
-; AVX512BW-NEXT:    movq %r12, %r10
-; AVX512BW-NEXT:    shlq $20, %r10
-; AVX512BW-NEXT:    orq %rax, %r10
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    kshiftrq $63, %k0, %k1
-; AVX512BW-NEXT:    shlq $21, %r12
-; AVX512BW-NEXT:    orq %r10, %r12
-; AVX512BW-NEXT:    movzbl %al, %r10d
-; AVX512BW-NEXT:    andl $1, %r10d
-; AVX512BW-NEXT:    movq %r10, %rax
-; AVX512BW-NEXT:    shlq $22, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r10, %r12
-; AVX512BW-NEXT:    shlq $23, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r10, %rax
-; AVX512BW-NEXT:    shlq $24, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r10, %r12
-; AVX512BW-NEXT:    shlq $25, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r10, %rax
-; AVX512BW-NEXT:    shlq $26, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r10, %r12
-; AVX512BW-NEXT:    shlq $27, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    shlq $28, %r10
-; AVX512BW-NEXT:    orq %r12, %r10
-; AVX512BW-NEXT:    movzbl %r9b, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %r12
-; AVX512BW-NEXT:    shlq $29, %r12
-; AVX512BW-NEXT:    orq %r10, %r12
-; AVX512BW-NEXT:    movq %rax, %r13
-; AVX512BW-NEXT:    shlq $30, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    kmovd %k1, %r10d
-; AVX512BW-NEXT:    kshiftrq $60, %k0, %k1
-; AVX512BW-NEXT:    shlq $31, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    andl $1, %r9d
-; AVX512BW-NEXT:    movq %r9, %r12
-; AVX512BW-NEXT:    shlq $32, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r9, %rax
-; AVX512BW-NEXT:    shlq $33, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r9, %r13
-; AVX512BW-NEXT:    shlq $34, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrq $61, %k0, %k1
-; AVX512BW-NEXT:    shlq $35, %r9
-; AVX512BW-NEXT:    orq %r13, %r9
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $36, %rax
-; AVX512BW-NEXT:    orq %r9, %rax
-; AVX512BW-NEXT:    movq %r12, %r9
-; AVX512BW-NEXT:    shlq $37, %r9
-; AVX512BW-NEXT:    orq %rax, %r9
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $38, %rax
-; AVX512BW-NEXT:    orq %r9, %rax
-; AVX512BW-NEXT:    movq %r12, %r9
-; AVX512BW-NEXT:    shlq $39, %r9
-; AVX512BW-NEXT:    orq %rax, %r9
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $40, %rax
-; AVX512BW-NEXT:    orq %r9, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $41, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    kmovd %k1, %r9d
-; AVX512BW-NEXT:    kshiftrq $62, %k0, %k1
-; AVX512BW-NEXT:    shlq $42, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    andl $1, %r9d
-; AVX512BW-NEXT:    movq %r9, %rax
-; AVX512BW-NEXT:    shlq $43, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r9, %r12
-; AVX512BW-NEXT:    shlq $44, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r9, %rax
-; AVX512BW-NEXT:    shlq $45, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r9, %r12
-; AVX512BW-NEXT:    shlq $46, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r9, %rax
-; AVX512BW-NEXT:    shlq $47, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r9, %r13
-; AVX512BW-NEXT:    shlq $48, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrq $36, %k0, %k1
-; AVX512BW-NEXT:    shlq $49, %r9
-; AVX512BW-NEXT:    orq %r13, %r9
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $50, %rax
-; AVX512BW-NEXT:    orq %r9, %rax
-; AVX512BW-NEXT:    movq %r12, %r9
-; AVX512BW-NEXT:    shlq $51, %r9
-; AVX512BW-NEXT:    orq %rax, %r9
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $52, %rax
-; AVX512BW-NEXT:    orq %r9, %rax
-; AVX512BW-NEXT:    movq %r12, %r9
-; AVX512BW-NEXT:    shlq $53, %r9
-; AVX512BW-NEXT:    orq %rax, %r9
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $54, %rax
-; AVX512BW-NEXT:    orq %r9, %rax
-; AVX512BW-NEXT:    movq %r12, %r9
-; AVX512BW-NEXT:    shlq $55, %r9
-; AVX512BW-NEXT:    orq %rax, %r9
-; AVX512BW-NEXT:    shlq $56, %r12
-; AVX512BW-NEXT:    orq %r9, %r12
-; AVX512BW-NEXT:    movl %r10d, %r13d
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $57, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %r9
-; AVX512BW-NEXT:    shlq $58, %r9
-; AVX512BW-NEXT:    orq %rax, %r9
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $59, %rax
-; AVX512BW-NEXT:    orq %r9, %rax
-; AVX512BW-NEXT:    movq %r13, %r9
-; AVX512BW-NEXT:    shlq $60, %r9
-; AVX512BW-NEXT:    orq %rax, %r9
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $61, %rax
-; AVX512BW-NEXT:    orq %r9, %rax
-; AVX512BW-NEXT:    kmovd %k1, %r9d
-; AVX512BW-NEXT:    kshiftrq $38, %k0, %k1
-; AVX512BW-NEXT:    movzbl %r10b, %r10d
-; AVX512BW-NEXT:    shlq $62, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    shlq $63, %r10
-; AVX512BW-NEXT:    orq %r13, %r10
-; AVX512BW-NEXT:    orq %r15, %r10
-; AVX512BW-NEXT:    movq %r10, 48(%rsi)
-; AVX512BW-NEXT:    movzbl %r9b, %r10d
-; AVX512BW-NEXT:    movl %r10d, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    leaq (%rax,%rax,2), %r15
-; AVX512BW-NEXT:    leaq (%r15,%rax,4), %rax
-; AVX512BW-NEXT:    movzbl %bpl, %r13d
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    leaq (%rax,%r13,8), %rax
-; AVX512BW-NEXT:    movq %r13, %r15
-; AVX512BW-NEXT:    shlq $4, %r15
-; AVX512BW-NEXT:    orq %rax, %r15
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $5, %r12
-; AVX512BW-NEXT:    orq %r15, %r12
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $6, %rax
-; AVX512BW-NEXT:    movq %r13, %r15
-; AVX512BW-NEXT:    shlq $7, %r15
-; AVX512BW-NEXT:    orq %rax, %r15
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $8, %rax
-; AVX512BW-NEXT:    orq %r15, %rax
-; AVX512BW-NEXT:    kmovd %k1, %ebp
-; AVX512BW-NEXT:    kshiftrq $39, %k0, %k1
-; AVX512BW-NEXT:    shlq $9, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movzbl %bpl, %r15d
-; AVX512BW-NEXT:    andl $1, %r15d
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $10, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r15, %r13
-; AVX512BW-NEXT:    shlq $11, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $12, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r15, %r13
-; AVX512BW-NEXT:    shlq $13, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $14, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r15, %r13
-; AVX512BW-NEXT:    shlq $15, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    kshiftrq $40, %k0, %k1
-; AVX512BW-NEXT:    shlq $16, %r15
-; AVX512BW-NEXT:    orq %r13, %r15
-; AVX512BW-NEXT:    movzbl %al, %r13d
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $17, %rax
-; AVX512BW-NEXT:    orq %r15, %rax
-; AVX512BW-NEXT:    movq %r13, %r15
-; AVX512BW-NEXT:    shlq $18, %r15
-; AVX512BW-NEXT:    orq %rax, %r15
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $19, %rax
-; AVX512BW-NEXT:    orq %r15, %rax
-; AVX512BW-NEXT:    movq %r13, %r15
-; AVX512BW-NEXT:    shlq $20, %r15
-; AVX512BW-NEXT:    orq %rax, %r15
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $21, %rax
-; AVX512BW-NEXT:    orq %r15, %rax
-; AVX512BW-NEXT:    movq %r13, %r15
-; AVX512BW-NEXT:    shlq $22, %r15
-; AVX512BW-NEXT:    orq %rax, %r15
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    kshiftrq $41, %k0, %k1
-; AVX512BW-NEXT:    shlq $23, %r13
-; AVX512BW-NEXT:    orq %r15, %r13
-; AVX512BW-NEXT:    movzbl %al, %ebp
-; AVX512BW-NEXT:    andl $1, %ebp
-; AVX512BW-NEXT:    movq %rbp, %rax
-; AVX512BW-NEXT:    shlq $24, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %rbp, %r15
-; AVX512BW-NEXT:    shlq $25, %r15
-; AVX512BW-NEXT:    orq %rax, %r15
-; AVX512BW-NEXT:    movq %rbp, %rax
-; AVX512BW-NEXT:    shlq $26, %rax
-; AVX512BW-NEXT:    orq %r15, %rax
-; AVX512BW-NEXT:    movq %rbp, %r15
-; AVX512BW-NEXT:    shlq $27, %r15
-; AVX512BW-NEXT:    orq %rax, %r15
-; AVX512BW-NEXT:    movq %rbp, %rax
-; AVX512BW-NEXT:    shlq $28, %rax
-; AVX512BW-NEXT:    orq %r15, %rax
-; AVX512BW-NEXT:    movq %rbp, %r13
-; AVX512BW-NEXT:    shlq $29, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    kmovd %k1, %r15d
-; AVX512BW-NEXT:    kshiftrq $42, %k0, %k1
-; AVX512BW-NEXT:    shlq $30, %rbp
-; AVX512BW-NEXT:    orq %r13, %rbp
-; AVX512BW-NEXT:    movzbl %r15b, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    shlq $31, %rax
-; AVX512BW-NEXT:    orq %rbp, %rax
-; AVX512BW-NEXT:    andl $1, %r15d
-; AVX512BW-NEXT:    movq %r15, %r13
-; AVX512BW-NEXT:    shlq $32, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $33, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r15, %r13
-; AVX512BW-NEXT:    shlq $34, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $35, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r15, %rbp
-; AVX512BW-NEXT:    shlq $36, %rbp
-; AVX512BW-NEXT:    orq %rax, %rbp
-; AVX512BW-NEXT:    kmovd %k1, %r13d
-; AVX512BW-NEXT:    kshiftrq $43, %k0, %k1
-; AVX512BW-NEXT:    shlq $37, %r15
-; AVX512BW-NEXT:    orq %rbp, %r15
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $38, %rax
-; AVX512BW-NEXT:    orq %r15, %rax
-; AVX512BW-NEXT:    movq %r13, %r15
-; AVX512BW-NEXT:    shlq $39, %r15
-; AVX512BW-NEXT:    orq %rax, %r15
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $40, %rax
-; AVX512BW-NEXT:    orq %r15, %rax
-; AVX512BW-NEXT:    movq %r13, %r15
-; AVX512BW-NEXT:    shlq $41, %r15
-; AVX512BW-NEXT:    orq %rax, %r15
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $42, %rax
-; AVX512BW-NEXT:    orq %r15, %rax
-; AVX512BW-NEXT:    movq %r13, %rbp
-; AVX512BW-NEXT:    shlq $43, %rbp
-; AVX512BW-NEXT:    orq %rax, %rbp
-; AVX512BW-NEXT:    kmovd %k1, %r15d
-; AVX512BW-NEXT:    kshiftrq $44, %k0, %k1
-; AVX512BW-NEXT:    shlq $44, %r13
-; AVX512BW-NEXT:    orq %rbp, %r13
-; AVX512BW-NEXT:    andl $1, %r15d
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $45, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r15, %r13
-; AVX512BW-NEXT:    shlq $46, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $47, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r15, %r13
-; AVX512BW-NEXT:    shlq $48, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $49, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r15, %rbp
-; AVX512BW-NEXT:    shlq $50, %rbp
-; AVX512BW-NEXT:    orq %rax, %rbp
-; AVX512BW-NEXT:    kmovd %k1, %r13d
-; AVX512BW-NEXT:    kshiftrq $22, %k0, %k1
-; AVX512BW-NEXT:    shlq $51, %r15
-; AVX512BW-NEXT:    orq %rbp, %r15
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $52, %rax
-; AVX512BW-NEXT:    orq %r15, %rax
-; AVX512BW-NEXT:    movq %r13, %r15
-; AVX512BW-NEXT:    shlq $53, %r15
-; AVX512BW-NEXT:    orq %rax, %r15
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $54, %rax
-; AVX512BW-NEXT:    orq %r15, %rax
-; AVX512BW-NEXT:    movq %r13, %r15
-; AVX512BW-NEXT:    shlq $55, %r15
-; AVX512BW-NEXT:    orq %rax, %r15
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $56, %rax
-; AVX512BW-NEXT:    orq %r15, %rax
-; AVX512BW-NEXT:    movq %r13, %r15
-; AVX512BW-NEXT:    shlq $57, %r15
-; AVX512BW-NEXT:    orq %rax, %r15
-; AVX512BW-NEXT:    shlq $58, %r13
-; AVX512BW-NEXT:    orq %r15, %r13
-; AVX512BW-NEXT:    movl %ebx, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %r15
-; AVX512BW-NEXT:    shlq $59, %r15
-; AVX512BW-NEXT:    orq %r13, %r15
-; AVX512BW-NEXT:    movq %rax, %r13
-; AVX512BW-NEXT:    shlq $60, %r13
-; AVX512BW-NEXT:    orq %r15, %r13
-; AVX512BW-NEXT:    movq %rax, %rbp
-; AVX512BW-NEXT:    shlq $61, %rbp
-; AVX512BW-NEXT:    orq %r13, %rbp
-; AVX512BW-NEXT:    kmovd %k1, %r15d
-; AVX512BW-NEXT:    kshiftrq $46, %k0, %k1
-; AVX512BW-NEXT:    movzbl %bl, %ebx
-; AVX512BW-NEXT:    shlq $62, %rax
-; AVX512BW-NEXT:    orq %rbp, %rax
-; AVX512BW-NEXT:    movq %rbx, %r13
-; AVX512BW-NEXT:    shlq $63, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    kshiftrq $47, %k0, %k1
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    movq %r13, 32(%rsi)
-; AVX512BW-NEXT:    movzbl %al, %r13d
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    andl $1, %ebx
-; AVX512BW-NEXT:    leaq (%rbx,%rbx,2), %rax
-; AVX512BW-NEXT:    leaq (%rax,%r13,4), %rax
-; AVX512BW-NEXT:    leaq (%rax,%r13,8), %rax
-; AVX512BW-NEXT:    movq %r13, %rbx
-; AVX512BW-NEXT:    shlq $4, %rbx
-; AVX512BW-NEXT:    orq %rax, %rbx
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $5, %r12
-; AVX512BW-NEXT:    orq %rbx, %r12
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $6, %rax
-; AVX512BW-NEXT:    movq %r13, %rbx
-; AVX512BW-NEXT:    shlq $7, %rbx
-; AVX512BW-NEXT:    orq %rax, %rbx
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    kshiftrq $48, %k0, %k1
-; AVX512BW-NEXT:    shlq $8, %r13
-; AVX512BW-NEXT:    orq %rbx, %r13
-; AVX512BW-NEXT:    movzbl %al, %ebx
-; AVX512BW-NEXT:    andl $1, %ebx
-; AVX512BW-NEXT:    movq %rbx, %rax
-; AVX512BW-NEXT:    shlq $9, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %rbx, %r13
-; AVX512BW-NEXT:    shlq $10, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %rbx, %rax
-; AVX512BW-NEXT:    shlq $11, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %rbx, %r13
-; AVX512BW-NEXT:    shlq $12, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %rbx, %rax
-; AVX512BW-NEXT:    shlq $13, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %rbx, %r13
-; AVX512BW-NEXT:    shlq $14, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    kshiftrq $49, %k0, %k1
-; AVX512BW-NEXT:    shlq $15, %rbx
-; AVX512BW-NEXT:    orq %r13, %rbx
-; AVX512BW-NEXT:    movzbl %al, %r13d
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $16, %rax
-; AVX512BW-NEXT:    orq %rbx, %rax
-; AVX512BW-NEXT:    movq %r13, %rbx
-; AVX512BW-NEXT:    shlq $17, %rbx
-; AVX512BW-NEXT:    orq %rax, %rbx
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $18, %rax
-; AVX512BW-NEXT:    orq %rbx, %rax
-; AVX512BW-NEXT:    movq %r13, %rbx
-; AVX512BW-NEXT:    shlq $19, %rbx
-; AVX512BW-NEXT:    orq %rax, %rbx
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $20, %rax
-; AVX512BW-NEXT:    orq %rbx, %rax
-; AVX512BW-NEXT:    movq %r13, %rbx
-; AVX512BW-NEXT:    shlq $21, %rbx
-; AVX512BW-NEXT:    orq %rax, %rbx
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    kshiftrq $50, %k0, %k1
-; AVX512BW-NEXT:    shlq $22, %r13
-; AVX512BW-NEXT:    orq %rbx, %r13
-; AVX512BW-NEXT:    movzbl %al, %ebx
-; AVX512BW-NEXT:    andl $1, %ebx
-; AVX512BW-NEXT:    movq %rbx, %rax
-; AVX512BW-NEXT:    shlq $23, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %rbx, %r13
-; AVX512BW-NEXT:    shlq $24, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %rbx, %rax
-; AVX512BW-NEXT:    shlq $25, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %rbx, %r13
-; AVX512BW-NEXT:    shlq $26, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %rbx, %rax
-; AVX512BW-NEXT:    shlq $27, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %rbx, %r13
-; AVX512BW-NEXT:    shlq $28, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    kshiftrq $51, %k0, %k1
-; AVX512BW-NEXT:    shlq $29, %rbx
-; AVX512BW-NEXT:    orq %r13, %rbx
-; AVX512BW-NEXT:    movzbl %al, %r13d
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $30, %rax
-; AVX512BW-NEXT:    orq %rbx, %rax
-; AVX512BW-NEXT:    movq %r13, %rbx
-; AVX512BW-NEXT:    shlq $31, %rbx
-; AVX512BW-NEXT:    orq %rax, %rbx
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $32, %rax
-; AVX512BW-NEXT:    orq %rbx, %rax
-; AVX512BW-NEXT:    movq %r13, %rbx
-; AVX512BW-NEXT:    shlq $33, %rbx
-; AVX512BW-NEXT:    orq %rax, %rbx
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $34, %rax
-; AVX512BW-NEXT:    orq %rbx, %rax
-; AVX512BW-NEXT:    movq %r13, %rbx
-; AVX512BW-NEXT:    shlq $35, %rbx
-; AVX512BW-NEXT:    orq %rax, %rbx
-; AVX512BW-NEXT:    kmovd %k1, %ebp
-; AVX512BW-NEXT:    kshiftrq $52, %k0, %k1
-; AVX512BW-NEXT:    shlq $36, %r13
-; AVX512BW-NEXT:    orq %rbx, %r13
-; AVX512BW-NEXT:    andl $1, %ebp
-; AVX512BW-NEXT:    movq %rbp, %rax
-; AVX512BW-NEXT:    shlq $37, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %rbp, %rbx
-; AVX512BW-NEXT:    shlq $38, %rbx
-; AVX512BW-NEXT:    orq %rax, %rbx
-; AVX512BW-NEXT:    movq %rbp, %rax
-; AVX512BW-NEXT:    shlq $39, %rax
-; AVX512BW-NEXT:    orq %rbx, %rax
-; AVX512BW-NEXT:    movq %rbp, %rbx
-; AVX512BW-NEXT:    shlq $40, %rbx
-; AVX512BW-NEXT:    orq %rax, %rbx
-; AVX512BW-NEXT:    movq %rbp, %rax
-; AVX512BW-NEXT:    shlq $41, %rax
-; AVX512BW-NEXT:    orq %rbx, %rax
-; AVX512BW-NEXT:    movq %rbp, %r13
-; AVX512BW-NEXT:    shlq $42, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    kmovd %k1, %ebx
-; AVX512BW-NEXT:    kshiftrq $53, %k0, %k1
-; AVX512BW-NEXT:    shlq $43, %rbp
-; AVX512BW-NEXT:    orq %r13, %rbp
-; AVX512BW-NEXT:    andl $1, %ebx
-; AVX512BW-NEXT:    movq %rbx, %rax
-; AVX512BW-NEXT:    shlq $44, %rax
-; AVX512BW-NEXT:    orq %rbp, %rax
-; AVX512BW-NEXT:    movq %rbx, %r13
-; AVX512BW-NEXT:    shlq $45, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %rbx, %rax
-; AVX512BW-NEXT:    shlq $46, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %rbx, %r13
-; AVX512BW-NEXT:    shlq $47, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %rbx, %rax
-; AVX512BW-NEXT:    shlq $48, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %rbx, %r13
-; AVX512BW-NEXT:    shlq $49, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    kshiftrq $19, %k0, %k1
-; AVX512BW-NEXT:    shlq $50, %rbx
-; AVX512BW-NEXT:    orq %r13, %rbx
-; AVX512BW-NEXT:    movzbl %al, %r13d
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $51, %rax
-; AVX512BW-NEXT:    orq %rbx, %rax
-; AVX512BW-NEXT:    movq %r13, %rbx
-; AVX512BW-NEXT:    shlq $52, %rbx
-; AVX512BW-NEXT:    orq %rax, %rbx
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $53, %rax
-; AVX512BW-NEXT:    orq %rbx, %rax
-; AVX512BW-NEXT:    movq %r13, %rbx
-; AVX512BW-NEXT:    shlq $54, %rbx
-; AVX512BW-NEXT:    orq %rax, %rbx
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $55, %rax
-; AVX512BW-NEXT:    orq %rbx, %rax
-; AVX512BW-NEXT:    movq %r13, %rbx
-; AVX512BW-NEXT:    shlq $56, %rbx
-; AVX512BW-NEXT:    orq %rax, %rbx
-; AVX512BW-NEXT:    kmovd %k1, %ebp
-; AVX512BW-NEXT:    kshiftrq $18, %k0, %k1
-; AVX512BW-NEXT:    shlq $57, %r13
-; AVX512BW-NEXT:    orq %rbx, %r13
-; AVX512BW-NEXT:    andl $1, %r11d
-; AVX512BW-NEXT:    movq %r11, %rax
-; AVX512BW-NEXT:    shlq $58, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r11, %rbx
-; AVX512BW-NEXT:    shlq $59, %rbx
-; AVX512BW-NEXT:    orq %rax, %rbx
-; AVX512BW-NEXT:    movq %r11, %rax
-; AVX512BW-NEXT:    shlq $60, %rax
-; AVX512BW-NEXT:    orq %rbx, %rax
-; AVX512BW-NEXT:    movq %r11, %r13
-; AVX512BW-NEXT:    shlq $61, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    kmovd %k1, %ebx
-; AVX512BW-NEXT:    kshiftrq $20, %k0, %k1
-; AVX512BW-NEXT:    shlq $62, %r11
-; AVX512BW-NEXT:    orq %r13, %r11
-; AVX512BW-NEXT:    shlq $63, %r14
-; AVX512BW-NEXT:    orq %r11, %r14
-; AVX512BW-NEXT:    orq %r12, %r14
-; AVX512BW-NEXT:    movq %r14, 40(%rsi)
-; AVX512BW-NEXT:    movzbl %bl, %r11d
-; AVX512BW-NEXT:    movl %r11d, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    leaq (%rax,%rax,2), %r14
-; AVX512BW-NEXT:    leaq (%r14,%rax,4), %r14
-; AVX512BW-NEXT:    leaq (%r14,%rax,8), %r14
-; AVX512BW-NEXT:    shlq $4, %rax
-; AVX512BW-NEXT:    orq %r14, %rax
-; AVX512BW-NEXT:    movzbl %bpl, %r13d
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movq %r13, %r14
-; AVX512BW-NEXT:    shlq $5, %r14
-; AVX512BW-NEXT:    orq %rax, %r14
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $6, %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $7, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $8, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $9, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $10, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    kmovd %k1, %ebp
-; AVX512BW-NEXT:    kshiftrq $21, %k0, %k1
-; AVX512BW-NEXT:    shlq $11, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movzbl %bpl, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $12, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $13, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $14, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $15, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $16, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $17, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    kshiftrq $27, %k0, %k1
-; AVX512BW-NEXT:    shlq $18, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    movzbl %al, %r13d
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $19, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $20, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $21, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $22, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $23, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $24, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    shlq $25, %r13
-; AVX512BW-NEXT:    orq %r12, %r13
-; AVX512BW-NEXT:    movzbl %r15b, %ebp
-; AVX512BW-NEXT:    andl $1, %ebp
-; AVX512BW-NEXT:    movq %rbp, %rax
-; AVX512BW-NEXT:    shlq $26, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %rbp, %r12
-; AVX512BW-NEXT:    shlq $27, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %rbp, %rax
-; AVX512BW-NEXT:    shlq $28, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %rbp, %r12
-; AVX512BW-NEXT:    shlq $29, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %rbp, %rax
-; AVX512BW-NEXT:    shlq $30, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrq $23, %k0, %k1
-; AVX512BW-NEXT:    shlq $31, %rbp
-; AVX512BW-NEXT:    orq %rax, %rbp
-; AVX512BW-NEXT:    kmovd %k1, %r13d
-; AVX512BW-NEXT:    kshiftrq $24, %k0, %k1
-; AVX512BW-NEXT:    andl $1, %r15d
-; AVX512BW-NEXT:    shlq $32, %r15
-; AVX512BW-NEXT:    orq %rbp, %r15
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $33, %rax
-; AVX512BW-NEXT:    orq %r15, %rax
-; AVX512BW-NEXT:    movq %r13, %r15
-; AVX512BW-NEXT:    shlq $34, %r15
-; AVX512BW-NEXT:    orq %rax, %r15
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $35, %rax
-; AVX512BW-NEXT:    orq %r15, %rax
-; AVX512BW-NEXT:    movq %r13, %r15
-; AVX512BW-NEXT:    shlq $36, %r15
-; AVX512BW-NEXT:    orq %rax, %r15
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $37, %rax
-; AVX512BW-NEXT:    orq %r15, %rax
-; AVX512BW-NEXT:    movq %r13, %rbp
-; AVX512BW-NEXT:    shlq $38, %rbp
-; AVX512BW-NEXT:    orq %rax, %rbp
-; AVX512BW-NEXT:    kmovd %k1, %r15d
-; AVX512BW-NEXT:    kshiftrq $25, %k0, %k1
-; AVX512BW-NEXT:    shlq $39, %r13
-; AVX512BW-NEXT:    orq %rbp, %r13
-; AVX512BW-NEXT:    andl $1, %r15d
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $40, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r15, %r13
-; AVX512BW-NEXT:    shlq $41, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $42, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r15, %r13
-; AVX512BW-NEXT:    shlq $43, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $44, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r15, %rbp
-; AVX512BW-NEXT:    shlq $45, %rbp
-; AVX512BW-NEXT:    orq %rax, %rbp
-; AVX512BW-NEXT:    kmovd %k1, %r13d
-; AVX512BW-NEXT:    kshiftrq $26, %k0, %k1
-; AVX512BW-NEXT:    shlq $46, %r15
-; AVX512BW-NEXT:    orq %rbp, %r15
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $47, %rax
-; AVX512BW-NEXT:    orq %r15, %rax
-; AVX512BW-NEXT:    movq %r13, %r15
-; AVX512BW-NEXT:    shlq $48, %r15
-; AVX512BW-NEXT:    orq %rax, %r15
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $49, %rax
-; AVX512BW-NEXT:    orq %r15, %rax
-; AVX512BW-NEXT:    movq %r13, %r15
-; AVX512BW-NEXT:    shlq $50, %r15
-; AVX512BW-NEXT:    orq %rax, %r15
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $51, %rax
-; AVX512BW-NEXT:    orq %r15, %rax
-; AVX512BW-NEXT:    movq %r13, %rbp
-; AVX512BW-NEXT:    shlq $52, %rbp
-; AVX512BW-NEXT:    orq %rax, %rbp
-; AVX512BW-NEXT:    kmovd %k1, %r15d
-; AVX512BW-NEXT:    kshiftrq $31, %k0, %k1
-; AVX512BW-NEXT:    shlq $53, %r13
-; AVX512BW-NEXT:    orq %rbp, %r13
-; AVX512BW-NEXT:    andl $1, %r15d
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $54, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r15, %r13
-; AVX512BW-NEXT:    shlq $55, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $56, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r15, %r13
-; AVX512BW-NEXT:    shlq $57, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $58, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r15, %r13
-; AVX512BW-NEXT:    shlq $59, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    shlq $60, %r15
-; AVX512BW-NEXT:    orq %r13, %r15
-; AVX512BW-NEXT:    movl %r12d, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %r13
-; AVX512BW-NEXT:    shlq $61, %r13
-; AVX512BW-NEXT:    orq %r15, %r13
-; AVX512BW-NEXT:    kmovd %k1, %ebp
-; AVX512BW-NEXT:    kshiftrq $28, %k0, %k1
-; AVX512BW-NEXT:    movzbl %r12b, %r15d
-; AVX512BW-NEXT:    shlq $62, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r15, %r12
-; AVX512BW-NEXT:    shlq $63, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    kshiftrq $29, %k0, %k1
-; AVX512BW-NEXT:    orq %r14, %r12
-; AVX512BW-NEXT:    movq %r12, 16(%rsi)
-; AVX512BW-NEXT:    andl $1, %r15d
-; AVX512BW-NEXT:    leaq (%r15,%r15,2), %r14
-; AVX512BW-NEXT:    leaq (%r14,%r15,4), %r14
-; AVX512BW-NEXT:    leaq (%r14,%r15,8), %r14
-; AVX512BW-NEXT:    movzbl %al, %r15d
-; AVX512BW-NEXT:    andl $1, %r15d
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $4, %rax
-; AVX512BW-NEXT:    orq %r14, %rax
-; AVX512BW-NEXT:    movq %r15, %r14
-; AVX512BW-NEXT:    shlq $5, %r14
-; AVX512BW-NEXT:    orq %rax, %r14
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $6, %rax
-; AVX512BW-NEXT:    movq %r15, %r12
-; AVX512BW-NEXT:    shlq $7, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $8, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r15, %r12
-; AVX512BW-NEXT:    shlq $9, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    kshiftrq $30, %k0, %k1
-; AVX512BW-NEXT:    shlq $10, %r15
-; AVX512BW-NEXT:    orq %r12, %r15
-; AVX512BW-NEXT:    movzbl %al, %r12d
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $11, %rax
-; AVX512BW-NEXT:    orq %r15, %rax
-; AVX512BW-NEXT:    movq %r12, %r15
-; AVX512BW-NEXT:    shlq $12, %r15
-; AVX512BW-NEXT:    orq %rax, %r15
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $13, %rax
-; AVX512BW-NEXT:    orq %r15, %rax
-; AVX512BW-NEXT:    movq %r12, %r15
-; AVX512BW-NEXT:    shlq $14, %r15
-; AVX512BW-NEXT:    orq %rax, %r15
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $15, %rax
-; AVX512BW-NEXT:    orq %r15, %rax
-; AVX512BW-NEXT:    movq %r12, %r15
-; AVX512BW-NEXT:    shlq $16, %r15
-; AVX512BW-NEXT:    orq %rax, %r15
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    kshiftrq $32, %k0, %k1
-; AVX512BW-NEXT:    shlq $17, %r12
-; AVX512BW-NEXT:    orq %r15, %r12
-; AVX512BW-NEXT:    movzbl %al, %r15d
-; AVX512BW-NEXT:    andl $1, %r15d
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $18, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r15, %r12
-; AVX512BW-NEXT:    shlq $19, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $20, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r15, %r12
-; AVX512BW-NEXT:    shlq $21, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $22, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r15, %r13
-; AVX512BW-NEXT:    shlq $23, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrq $33, %k0, %k1
-; AVX512BW-NEXT:    shlq $24, %r15
-; AVX512BW-NEXT:    orq %r13, %r15
-; AVX512BW-NEXT:    movzbl %bpl, %ebp
-; AVX512BW-NEXT:    andl $1, %ebp
-; AVX512BW-NEXT:    movq %rbp, %rax
-; AVX512BW-NEXT:    shlq $25, %rax
-; AVX512BW-NEXT:    orq %r15, %rax
-; AVX512BW-NEXT:    movq %rbp, %r15
-; AVX512BW-NEXT:    shlq $26, %r15
-; AVX512BW-NEXT:    orq %rax, %r15
-; AVX512BW-NEXT:    movq %rbp, %rax
-; AVX512BW-NEXT:    shlq $27, %rax
-; AVX512BW-NEXT:    orq %r15, %rax
-; AVX512BW-NEXT:    movq %rbp, %r15
-; AVX512BW-NEXT:    shlq $28, %r15
-; AVX512BW-NEXT:    orq %rax, %r15
-; AVX512BW-NEXT:    movq %rbp, %rax
-; AVX512BW-NEXT:    shlq $29, %rax
-; AVX512BW-NEXT:    orq %r15, %rax
-; AVX512BW-NEXT:    movq %rbp, %r15
-; AVX512BW-NEXT:    shlq $30, %r15
-; AVX512BW-NEXT:    orq %rax, %r15
-; AVX512BW-NEXT:    kmovd %k1, %r13d
-; AVX512BW-NEXT:    kshiftrq $34, %k0, %k1
-; AVX512BW-NEXT:    shlq $31, %rbp
-; AVX512BW-NEXT:    orq %r15, %rbp
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $32, %rax
-; AVX512BW-NEXT:    orq %rbp, %rax
-; AVX512BW-NEXT:    movq %r12, %r15
-; AVX512BW-NEXT:    shlq $33, %r15
-; AVX512BW-NEXT:    orq %rax, %r15
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $34, %rax
-; AVX512BW-NEXT:    orq %r15, %rax
-; AVX512BW-NEXT:    movq %r12, %r15
-; AVX512BW-NEXT:    shlq $35, %r15
-; AVX512BW-NEXT:    orq %rax, %r15
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $36, %rax
-; AVX512BW-NEXT:    orq %r15, %rax
-; AVX512BW-NEXT:    movq %r12, %rbp
-; AVX512BW-NEXT:    shlq $37, %rbp
-; AVX512BW-NEXT:    orq %rax, %rbp
-; AVX512BW-NEXT:    kmovd %k1, %r15d
-; AVX512BW-NEXT:    kshiftrq $35, %k0, %k1
-; AVX512BW-NEXT:    shlq $38, %r12
-; AVX512BW-NEXT:    orq %rbp, %r12
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $39, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $40, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $41, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %r12
-; AVX512BW-NEXT:    shlq $42, %r12
-; AVX512BW-NEXT:    orq %rax, %r12
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $43, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %rbp
-; AVX512BW-NEXT:    shlq $44, %rbp
-; AVX512BW-NEXT:    orq %rax, %rbp
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrq $14, %k0, %k1
-; AVX512BW-NEXT:    shlq $45, %r13
-; AVX512BW-NEXT:    orq %rbp, %r13
-; AVX512BW-NEXT:    andl $1, %r15d
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $46, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r15, %r13
-; AVX512BW-NEXT:    shlq $47, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $48, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r15, %r13
-; AVX512BW-NEXT:    shlq $49, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $50, %rax
-; AVX512BW-NEXT:    orq %r13, %rax
-; AVX512BW-NEXT:    movq %r15, %r13
-; AVX512BW-NEXT:    shlq $51, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    kmovd %k1, %ebp
-; AVX512BW-NEXT:    kshiftrq $15, %k0, %k1
-; AVX512BW-NEXT:    shlq $52, %r15
-; AVX512BW-NEXT:    orq %r13, %r15
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $53, %rax
-; AVX512BW-NEXT:    orq %r15, %rax
-; AVX512BW-NEXT:    movq %r12, %r15
-; AVX512BW-NEXT:    shlq $54, %r15
-; AVX512BW-NEXT:    orq %rax, %r15
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $55, %rax
-; AVX512BW-NEXT:    orq %r15, %rax
-; AVX512BW-NEXT:    movq %r12, %r15
-; AVX512BW-NEXT:    shlq $56, %r15
-; AVX512BW-NEXT:    orq %rax, %r15
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $57, %rax
-; AVX512BW-NEXT:    orq %r15, %rax
-; AVX512BW-NEXT:    movq %r12, %r13
-; AVX512BW-NEXT:    shlq $58, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    kmovd %k1, %r15d
-; AVX512BW-NEXT:    kshiftrq $16, %k0, %k1
-; AVX512BW-NEXT:    shlq $59, %r12
-; AVX512BW-NEXT:    orq %r13, %r12
-; AVX512BW-NEXT:    andl $1, %r9d
-; AVX512BW-NEXT:    movq %r9, %rax
-; AVX512BW-NEXT:    shlq $60, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r9, %r13
-; AVX512BW-NEXT:    shlq $61, %r13
-; AVX512BW-NEXT:    orq %rax, %r13
-; AVX512BW-NEXT:    kmovd %k1, %r12d
-; AVX512BW-NEXT:    kshiftrq $17, %k0, %k0
-; AVX512BW-NEXT:    shlq $62, %r9
-; AVX512BW-NEXT:    orq %r13, %r9
-; AVX512BW-NEXT:    kmovd %k0, %r13d
-; AVX512BW-NEXT:    shlq $63, %r10
-; AVX512BW-NEXT:    orq %r9, %r10
-; AVX512BW-NEXT:    orq %r14, %r10
-; AVX512BW-NEXT:    movq %r10, 24(%rsi)
-; AVX512BW-NEXT:    andl $1, %r8d
-; AVX512BW-NEXT:    leaq (%r8,%r8,2), %rax
-; AVX512BW-NEXT:    leaq (%rax,%r8,4), %rax
-; AVX512BW-NEXT:    leaq (%rax,%r8,8), %rax
-; AVX512BW-NEXT:    movq %r8, %r9
-; AVX512BW-NEXT:    shlq $4, %r9
-; AVX512BW-NEXT:    orq %rax, %r9
-; AVX512BW-NEXT:    shlq $5, %r8
-; AVX512BW-NEXT:    orq %r9, %r8
-; AVX512BW-NEXT:    movzbl %dil, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %rdi
-; AVX512BW-NEXT:    shlq $6, %rdi
-; AVX512BW-NEXT:    movq %rax, %r9
-; AVX512BW-NEXT:    shlq $7, %r9
-; AVX512BW-NEXT:    orq %rdi, %r9
-; AVX512BW-NEXT:    movq %rax, %rdi
-; AVX512BW-NEXT:    shlq $8, %rdi
-; AVX512BW-NEXT:    orq %r9, %rdi
-; AVX512BW-NEXT:    movq %rax, %r9
-; AVX512BW-NEXT:    shlq $9, %r9
-; AVX512BW-NEXT:    orq %rdi, %r9
-; AVX512BW-NEXT:    movq %rax, %rdi
-; AVX512BW-NEXT:    shlq $10, %rdi
-; AVX512BW-NEXT:    orq %r9, %rdi
-; AVX512BW-NEXT:    movq %rax, %r9
-; AVX512BW-NEXT:    shlq $11, %r9
-; AVX512BW-NEXT:    orq %rdi, %r9
-; AVX512BW-NEXT:    shlq $12, %rax
-; AVX512BW-NEXT:    orq %r9, %rax
-; AVX512BW-NEXT:    movzbl %dl, %edx
-; AVX512BW-NEXT:    andl $1, %edx
-; AVX512BW-NEXT:    movq %rdx, %rdi
-; AVX512BW-NEXT:    shlq $13, %rdi
-; AVX512BW-NEXT:    orq %rax, %rdi
-; AVX512BW-NEXT:    movq %rdx, %rax
-; AVX512BW-NEXT:    shlq $14, %rax
-; AVX512BW-NEXT:    orq %rdi, %rax
-; AVX512BW-NEXT:    movq %rdx, %rdi
-; AVX512BW-NEXT:    shlq $15, %rdi
-; AVX512BW-NEXT:    orq %rax, %rdi
-; AVX512BW-NEXT:    movq %rdx, %rax
-; AVX512BW-NEXT:    shlq $16, %rax
-; AVX512BW-NEXT:    orq %rdi, %rax
-; AVX512BW-NEXT:    movq %rdx, %rdi
-; AVX512BW-NEXT:    shlq $17, %rdi
-; AVX512BW-NEXT:    orq %rax, %rdi
-; AVX512BW-NEXT:    movq %rdx, %rax
-; AVX512BW-NEXT:    shlq $18, %rax
-; AVX512BW-NEXT:    orq %rdi, %rax
-; AVX512BW-NEXT:    shlq $19, %rdx
-; AVX512BW-NEXT:    orq %rax, %rdx
-; AVX512BW-NEXT:    movzbl %cl, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    movq %rax, %rcx
-; AVX512BW-NEXT:    shlq $20, %rcx
-; AVX512BW-NEXT:    orq %rdx, %rcx
-; AVX512BW-NEXT:    movq %rax, %rdx
-; AVX512BW-NEXT:    shlq $21, %rdx
-; AVX512BW-NEXT:    orq %rcx, %rdx
-; AVX512BW-NEXT:    movq %rax, %rcx
-; AVX512BW-NEXT:    shlq $22, %rcx
-; AVX512BW-NEXT:    orq %rdx, %rcx
-; AVX512BW-NEXT:    movq %rax, %rdx
-; AVX512BW-NEXT:    shlq $23, %rdx
-; AVX512BW-NEXT:    orq %rcx, %rdx
-; AVX512BW-NEXT:    movq %rax, %rcx
-; AVX512BW-NEXT:    shlq $24, %rcx
-; AVX512BW-NEXT:    orq %rdx, %rcx
-; AVX512BW-NEXT:    movq %rax, %rdx
-; AVX512BW-NEXT:    shlq $25, %rdx
-; AVX512BW-NEXT:    orq %rcx, %rdx
-; AVX512BW-NEXT:    shlq $26, %rax
-; AVX512BW-NEXT:    orq %rdx, %rax
-; AVX512BW-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; AVX512BW-NEXT:    andl $1, %ecx
-; AVX512BW-NEXT:    movq %rcx, %rdx
-; AVX512BW-NEXT:    shlq $27, %rdx
-; AVX512BW-NEXT:    orq %rax, %rdx
-; AVX512BW-NEXT:    movq %rcx, %rax
-; AVX512BW-NEXT:    shlq $28, %rax
-; AVX512BW-NEXT:    orq %rdx, %rax
-; AVX512BW-NEXT:    movq %rcx, %rdx
-; AVX512BW-NEXT:    shlq $29, %rdx
-; AVX512BW-NEXT:    orq %rax, %rdx
-; AVX512BW-NEXT:    movq %rcx, %rax
-; AVX512BW-NEXT:    shlq $30, %rax
-; AVX512BW-NEXT:    orq %rdx, %rax
-; AVX512BW-NEXT:    movq %rcx, %rdx
-; AVX512BW-NEXT:    shlq $31, %rdx
-; AVX512BW-NEXT:    orq %rax, %rdx
-; AVX512BW-NEXT:    movq %rcx, %rax
-; AVX512BW-NEXT:    shlq $32, %rax
-; AVX512BW-NEXT:    orq %rdx, %rax
-; AVX512BW-NEXT:    shlq $33, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    andl $1, %ebp
-; AVX512BW-NEXT:    movq %rbp, %rax
-; AVX512BW-NEXT:    shlq $34, %rax
-; AVX512BW-NEXT:    orq %rcx, %rax
-; AVX512BW-NEXT:    movq %rbp, %rcx
-; AVX512BW-NEXT:    shlq $35, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    movq %rbp, %rax
-; AVX512BW-NEXT:    shlq $36, %rax
-; AVX512BW-NEXT:    orq %rcx, %rax
-; AVX512BW-NEXT:    movq %rbp, %rcx
-; AVX512BW-NEXT:    shlq $37, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    movq %rbp, %rax
-; AVX512BW-NEXT:    shlq $38, %rax
-; AVX512BW-NEXT:    orq %rcx, %rax
-; AVX512BW-NEXT:    movq %rbp, %rcx
-; AVX512BW-NEXT:    shlq $39, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    shlq $40, %rbp
-; AVX512BW-NEXT:    orq %rcx, %rbp
-; AVX512BW-NEXT:    andl $1, %r15d
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $41, %rax
-; AVX512BW-NEXT:    orq %rbp, %rax
-; AVX512BW-NEXT:    movq %r15, %rcx
-; AVX512BW-NEXT:    shlq $42, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $43, %rax
-; AVX512BW-NEXT:    orq %rcx, %rax
-; AVX512BW-NEXT:    movq %r15, %rcx
-; AVX512BW-NEXT:    shlq $44, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    shlq $45, %rax
-; AVX512BW-NEXT:    orq %rcx, %rax
-; AVX512BW-NEXT:    movq %r15, %rcx
-; AVX512BW-NEXT:    shlq $46, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    shlq $47, %r15
-; AVX512BW-NEXT:    orq %rcx, %r15
-; AVX512BW-NEXT:    andl $1, %r12d
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $48, %rax
-; AVX512BW-NEXT:    orq %r15, %rax
-; AVX512BW-NEXT:    movq %r12, %rcx
-; AVX512BW-NEXT:    shlq $49, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $50, %rax
-; AVX512BW-NEXT:    orq %rcx, %rax
-; AVX512BW-NEXT:    movq %r12, %rcx
-; AVX512BW-NEXT:    shlq $51, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    movq %r12, %rax
-; AVX512BW-NEXT:    shlq $52, %rax
-; AVX512BW-NEXT:    orq %rcx, %rax
-; AVX512BW-NEXT:    movq %r12, %rcx
-; AVX512BW-NEXT:    shlq $53, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    shlq $54, %r12
-; AVX512BW-NEXT:    orq %rcx, %r12
-; AVX512BW-NEXT:    andl $1, %r13d
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $55, %rax
-; AVX512BW-NEXT:    orq %r12, %rax
-; AVX512BW-NEXT:    movq %r13, %rcx
-; AVX512BW-NEXT:    shlq $56, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $57, %rax
-; AVX512BW-NEXT:    orq %rcx, %rax
-; AVX512BW-NEXT:    movq %r13, %rcx
-; AVX512BW-NEXT:    shlq $58, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    movq %r13, %rax
-; AVX512BW-NEXT:    shlq $59, %rax
-; AVX512BW-NEXT:    orq %rcx, %rax
-; AVX512BW-NEXT:    movq %r13, %rcx
-; AVX512BW-NEXT:    shlq $60, %rcx
-; AVX512BW-NEXT:    orq %rax, %rcx
-; AVX512BW-NEXT:    shlq $61, %r13
-; AVX512BW-NEXT:    orq %rcx, %r13
-; AVX512BW-NEXT:    andl $1, %ebx
-; AVX512BW-NEXT:    shlq $62, %rbx
-; AVX512BW-NEXT:    orq %r13, %rbx
-; AVX512BW-NEXT:    shlq $63, %r11
-; AVX512BW-NEXT:    orq %rbx, %r11
-; AVX512BW-NEXT:    orq %r8, %r11
-; AVX512BW-NEXT:    movq %r11, 8(%rsi)
-; AVX512BW-NEXT:    popq %rbx
-; AVX512BW-NEXT:    popq %r12
-; AVX512BW-NEXT:    popq %r13
-; AVX512BW-NEXT:    popq %r14
-; AVX512BW-NEXT:    popq %r15
-; AVX512BW-NEXT:    popq %rbp
+; AVX512BW-NEXT:    kmovq (%rdi), %k4
+; AVX512BW-NEXT:    movw $-3, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k0
+; AVX512BW-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k0, %k4, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k4, %k1
+; AVX512BW-NEXT:    kshiftrw $14, %k1, %k3
+; AVX512BW-NEXT:    korw %k3, %k0, %k0
+; AVX512BW-NEXT:    movw $-5, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k2
+; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $13, %k1, %k3
+; AVX512BW-NEXT:    korw %k3, %k0, %k0
+; AVX512BW-NEXT:    movw $-9, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k2
+; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $12, %k1, %k3
+; AVX512BW-NEXT:    korw %k3, %k0, %k0
+; AVX512BW-NEXT:    movw $-17, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k2
+; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $11, %k1, %k3
+; AVX512BW-NEXT:    korw %k3, %k0, %k0
+; AVX512BW-NEXT:    movw $-33, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k2
+; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $10, %k1, %k3
+; AVX512BW-NEXT:    korw %k3, %k0, %k0
+; AVX512BW-NEXT:    movw $-65, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k2
+; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $9, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    movw $-129, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k1
+; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $1, %k4, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $8, %k1, %k3
+; AVX512BW-NEXT:    korw %k3, %k0, %k0
+; AVX512BW-NEXT:    movw $-257, %ax # imm = 0xFEFF
+; AVX512BW-NEXT:    kmovd %eax, %k2
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kmovq %k2, %k5
+; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kshiftrw $7, %k1, %k3
+; AVX512BW-NEXT:    korw %k3, %k0, %k0
+; AVX512BW-NEXT:    movw $-513, %ax # imm = 0xFDFF
+; AVX512BW-NEXT:    kmovd %eax, %k2
+; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $6, %k1, %k3
+; AVX512BW-NEXT:    korw %k3, %k0, %k0
+; AVX512BW-NEXT:    movw $-1025, %ax # imm = 0xFBFF
+; AVX512BW-NEXT:    kmovd %eax, %k2
+; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $5, %k1, %k3
+; AVX512BW-NEXT:    korw %k3, %k0, %k0
+; AVX512BW-NEXT:    movw $-2049, %ax # imm = 0xF7FF
+; AVX512BW-NEXT:    kmovd %eax, %k2
+; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $4, %k1, %k3
+; AVX512BW-NEXT:    korw %k3, %k0, %k0
+; AVX512BW-NEXT:    movw $-4097, %ax # imm = 0xEFFF
+; AVX512BW-NEXT:    kmovd %eax, %k2
+; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $3, %k1, %k3
+; AVX512BW-NEXT:    korw %k3, %k0, %k0
+; AVX512BW-NEXT:    movw $-8193, %ax # imm = 0xDFFF
+; AVX512BW-NEXT:    kmovd %eax, %k2
+; AVX512BW-NEXT:    kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $2, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    movw $-16385, %ax # imm = 0xBFFF
+; AVX512BW-NEXT:    kmovd %eax, %k1
+; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $2, %k4, %k1
+; AVX512BW-NEXT:    kshiftlw $14, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k0, %k6
+; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k6} {z}
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k7, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $13, %k7, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $12, %k7, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $11, %k7, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k1
+; AVX512BW-NEXT:    kmovq %k4, %k7
+; AVX512BW-NEXT:    kmovq %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512BW-NEXT:    kshiftrq $3, %k4, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $10, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $9, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $8, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $7, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $6, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $5, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $4, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k1
+; AVX512BW-NEXT:    kshiftrq $4, %k7, %k6
+; AVX512BW-NEXT:    kshiftlw $15, %k6, %k0
+; AVX512BW-NEXT:    kshiftrw $3, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $2, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $14, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
+; AVX512BW-NEXT:    korw %k0, %k1, %k1
+; AVX512BW-NEXT:    vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
+; AVX512BW-NEXT:    kandw %k3, %k6, %k1
+; AVX512BW-NEXT:    kshiftrw $14, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $13, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    kandw %k2, %k0, %k1
+; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
+; AVX512BW-NEXT:    kshiftrq $5, %k7, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $12, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $11, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $10, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $9, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $8, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $7, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $6, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k6
+; AVX512BW-NEXT:    kshiftrq $6, %k7, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k1
+; AVX512BW-NEXT:    kshiftrw $5, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kandw %k5, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $4, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $3, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $2, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k6, %k6
+; AVX512BW-NEXT:    kshiftlw $14, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kshiftlw $1, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $1, %k6, %k6
+; AVX512BW-NEXT:    korw %k1, %k6, %k1
+; AVX512BW-NEXT:    vmovdqa32 128(%rsi), %zmm2 {%k1} {z}
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k1
+; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
+; AVX512BW-NEXT:    kshiftrq $7, %k7, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $13, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $12, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $11, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $10, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $9, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $8, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k1
+; AVX512BW-NEXT:    kshiftrq $8, %k7, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k6
+; AVX512BW-NEXT:    kshiftrw $7, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $6, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $5, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $4, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $2, %k6, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $14, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload
+; AVX512BW-NEXT:    kshiftrq $9, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k7
+; AVX512BW-NEXT:    vmovdqa32 192(%rsi), %zmm3 {%k7} {z}
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k1, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $13, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $12, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $11, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $10, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kandw %k2, %k0, %k1
+; AVX512BW-NEXT:    kmovq %k5, %k4
+; AVX512BW-NEXT:    kshiftrq $10, %k5, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $9, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $8, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $7, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $6, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $5, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $4, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $3, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k1
+; AVX512BW-NEXT:    kshiftrq $11, %k4, %k6
+; AVX512BW-NEXT:    kshiftlw $15, %k6, %k0
+; AVX512BW-NEXT:    kshiftrw $2, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $14, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
+; AVX512BW-NEXT:    korw %k0, %k1, %k1
+; AVX512BW-NEXT:    vmovdqa32 256(%rsi), %zmm4 {%k1} {z}
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k6, %k1
+; AVX512BW-NEXT:    kshiftrw $14, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $13, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $12, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k1
+; AVX512BW-NEXT:    kshiftrq $12, %k4, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $11, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $10, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $9, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $8, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $7, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $6, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $5, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k6
+; AVX512BW-NEXT:    kshiftrq $13, %k4, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k1
+; AVX512BW-NEXT:    kshiftrw $4, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kandw %k5, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $3, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $2, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k6, %k6
+; AVX512BW-NEXT:    kshiftlw $14, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kshiftlw $1, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $1, %k6, %k6
+; AVX512BW-NEXT:    korw %k1, %k6, %k6
+; AVX512BW-NEXT:    vmovdqa32 320(%rsi), %zmm5 {%k6} {z}
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k1
+; AVX512BW-NEXT:    kshiftrq $14, %k4, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $13, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $12, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $11, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $10, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $9, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $8, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $7, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    kandw %k3, %k0, %k6
+; AVX512BW-NEXT:    kshiftrq $15, %k4, %k1
+; AVX512BW-NEXT:    kmovq %k4, %k3
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k0
+; AVX512BW-NEXT:    kshiftrw $6, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $5, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $4, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $3, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $2, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k6, %k6
+; AVX512BW-NEXT:    kshiftlw $14, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k6, %k1
+; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
+; AVX512BW-NEXT:    korw %k0, %k1, %k1
+; AVX512BW-NEXT:    vmovdqa32 384(%rsi), %zmm6 {%k1} {z}
+; AVX512BW-NEXT:    kmovq %k3, %k2
+; AVX512BW-NEXT:    kshiftrq $16, %k3, %k0
+; AVX512BW-NEXT:    kandw %k5, %k0, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $13, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $12, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $11, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $10, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $9, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k1
+; AVX512BW-NEXT:    kshiftrq $17, %k2, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $8, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $7, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $6, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $5, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $4, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $3, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $2, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    kandw %k7, %k0, %k0
+; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 8-byte Reload
+; AVX512BW-NEXT:    kshiftrq $18, %k4, %k1
+; AVX512BW-NEXT:    kshiftlw $14, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k7
+; AVX512BW-NEXT:    vmovdqa32 448(%rsi), %zmm7 {%k7} {z}
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k1, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kandw %k5, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $13, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $12, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $11, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kandw %k3, %k0, %k1
+; AVX512BW-NEXT:    kshiftrq $19, %k4, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $10, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $9, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $8, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $7, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $6, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $5, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $4, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k1
+; AVX512BW-NEXT:    kshiftrq $20, %k4, %k6
+; AVX512BW-NEXT:    kshiftlw $15, %k6, %k0
+; AVX512BW-NEXT:    kshiftrw $3, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $2, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $14, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
+; AVX512BW-NEXT:    korw %k0, %k1, %k1
+; AVX512BW-NEXT:    vmovdqa32 512(%rsi), %zmm8 {%k1} {z}
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k6, %k1
+; AVX512BW-NEXT:    kshiftrw $14, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $13, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k1
+; AVX512BW-NEXT:    kshiftrq $21, %k4, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $12, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $11, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $10, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $9, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $8, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $7, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $6, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k6
+; AVX512BW-NEXT:    kshiftrq $22, %k4, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k1
+; AVX512BW-NEXT:    kshiftrw $5, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $4, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $3, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kandw %k5, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $2, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k6, %k6
+; AVX512BW-NEXT:    kshiftlw $14, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kshiftlw $1, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $1, %k6, %k6
+; AVX512BW-NEXT:    korw %k1, %k6, %k1
+; AVX512BW-NEXT:    vmovdqa32 576(%rsi), %zmm9 {%k1} {z}
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k1
+; AVX512BW-NEXT:    kshiftrq $23, %k4, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $13, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $12, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $11, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $10, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $9, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $8, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k1
+; AVX512BW-NEXT:    kshiftrq $24, %k4, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k6
+; AVX512BW-NEXT:    kshiftrw $7, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $6, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $5, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $4, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $2, %k6, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $14, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $25, %k4, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k7
+; AVX512BW-NEXT:    vmovdqa32 640(%rsi), %zmm10 {%k7} {z}
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k1, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $13, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $12, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $11, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $10, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k1
+; AVX512BW-NEXT:    kshiftrq $26, %k4, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $9, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $8, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $7, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $6, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $5, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $4, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $3, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k1
+; AVX512BW-NEXT:    kshiftrq $27, %k4, %k6
+; AVX512BW-NEXT:    kshiftlw $15, %k6, %k0
+; AVX512BW-NEXT:    kshiftrw $2, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $14, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
+; AVX512BW-NEXT:    korw %k0, %k1, %k1
+; AVX512BW-NEXT:    vmovdqa32 704(%rsi), %zmm11 {%k1} {z}
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k6, %k1
+; AVX512BW-NEXT:    kshiftrw $14, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $13, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $12, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k1
+; AVX512BW-NEXT:    kshiftrq $28, %k4, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $11, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $10, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $9, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $8, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $7, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $6, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $5, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k6
+; AVX512BW-NEXT:    kshiftrq $29, %k4, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k1
+; AVX512BW-NEXT:    kshiftrw $4, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $3, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $2, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k6, %k6
+; AVX512BW-NEXT:    kshiftlw $14, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kshiftlw $1, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $1, %k6, %k6
+; AVX512BW-NEXT:    korw %k1, %k6, %k6
+; AVX512BW-NEXT:    vmovdqa32 768(%rsi), %zmm12 {%k6} {z}
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k1
+; AVX512BW-NEXT:    kshiftrq $30, %k4, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $13, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $12, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $11, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $10, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $9, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $8, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $7, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k6
+; AVX512BW-NEXT:    kshiftrq $31, %k4, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k0
+; AVX512BW-NEXT:    kshiftrw $6, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kandw %k5, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $5, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kandw %k3, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $4, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $3, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $2, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k6, %k6
+; AVX512BW-NEXT:    kshiftlw $14, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k6, %k1
+; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
+; AVX512BW-NEXT:    korw %k0, %k1, %k1
+; AVX512BW-NEXT:    vmovdqa32 832(%rsi), %zmm13 {%k1} {z}
+; AVX512BW-NEXT:    kshiftrq $32, %k4, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $13, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $12, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $11, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $10, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $9, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k1
+; AVX512BW-NEXT:    kshiftrq $33, %k4, %k0
+; AVX512BW-NEXT:    kmovq %k4, %k7
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $8, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $7, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $6, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $5, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $4, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $3, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $2, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $34, %k7, %k1
+; AVX512BW-NEXT:    kmovq %k7, %k3
+; AVX512BW-NEXT:    kshiftlw $14, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k7
+; AVX512BW-NEXT:    vmovdqa32 896(%rsi), %zmm14 {%k7} {z}
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k1, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kandw %k5, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $13, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $12, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $11, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kandw %k2, %k0, %k1
+; AVX512BW-NEXT:    kmovq %k3, %k7
+; AVX512BW-NEXT:    kshiftrq $35, %k3, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $10, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $9, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $8, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $7, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $6, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $5, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $4, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k1
+; AVX512BW-NEXT:    kshiftrq $36, %k7, %k6
+; AVX512BW-NEXT:    kshiftlw $15, %k6, %k0
+; AVX512BW-NEXT:    kshiftrw $3, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $2, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $14, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
+; AVX512BW-NEXT:    korw %k0, %k1, %k1
+; AVX512BW-NEXT:    vmovdqa32 960(%rsi), %zmm15 {%k1} {z}
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k6, %k1
+; AVX512BW-NEXT:    kshiftrw $14, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $13, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    kandw %k4, %k0, %k1
+; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
+; AVX512BW-NEXT:    kshiftrq $37, %k7, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $12, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $11, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $10, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $9, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $8, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $7, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $6, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    kandw %k2, %k0, %k6
+; AVX512BW-NEXT:    kshiftrq $38, %k7, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k1
+; AVX512BW-NEXT:    kshiftrw $5, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $4, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $3, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kandw %k3, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $2, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k6, %k6
+; AVX512BW-NEXT:    kshiftlw $14, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kshiftlw $1, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $1, %k6, %k6
+; AVX512BW-NEXT:    korw %k1, %k6, %k1
+; AVX512BW-NEXT:    vmovdqa32 1024(%rsi), %zmm16 {%k1} {z}
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k0, %k1
+; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
+; AVX512BW-NEXT:    kshiftrq $39, %k7, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $13, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $12, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $11, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $10, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $9, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $8, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k1
+; AVX512BW-NEXT:    kshiftrq $40, %k7, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k6
+; AVX512BW-NEXT:    kshiftrw $7, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $6, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $5, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $4, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $2, %k6, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $14, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload
+; AVX512BW-NEXT:    kshiftrq $41, %k5, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k7
+; AVX512BW-NEXT:    vmovdqa32 1088(%rsi), %zmm17 {%k7} {z}
+; AVX512BW-NEXT:    kandw %k3, %k1, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $13, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $12, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $11, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $10, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k0, %k1
+; AVX512BW-NEXT:    kmovq %k5, %k7
+; AVX512BW-NEXT:    kshiftrq $42, %k5, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $9, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $8, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $7, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $6, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $5, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $4, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $3, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k1
+; AVX512BW-NEXT:    kshiftrq $43, %k7, %k6
+; AVX512BW-NEXT:    kshiftlw $15, %k6, %k0
+; AVX512BW-NEXT:    kshiftrw $2, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $14, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
+; AVX512BW-NEXT:    korw %k0, %k1, %k1
+; AVX512BW-NEXT:    vmovdqa32 1152(%rsi), %zmm18 {%k1} {z}
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k6, %k1
+; AVX512BW-NEXT:    kshiftrw $14, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $13, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $12, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k1
+; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
+; AVX512BW-NEXT:    kshiftrq $44, %k7, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $11, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $10, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $9, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $8, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $7, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $6, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $5, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k6
+; AVX512BW-NEXT:    kshiftrq $45, %k7, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k1
+; AVX512BW-NEXT:    kshiftrw $4, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $3, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $2, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k6, %k6
+; AVX512BW-NEXT:    kshiftlw $14, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kshiftlw $1, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $1, %k6, %k6
+; AVX512BW-NEXT:    korw %k1, %k6, %k6
+; AVX512BW-NEXT:    vmovdqa32 1216(%rsi), %zmm19 {%k6} {z}
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k1
+; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
+; AVX512BW-NEXT:    kshiftrq $46, %k7, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $13, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $12, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $11, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $10, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $9, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $8, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $7, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    kandw %k5, %k0, %k6
+; AVX512BW-NEXT:    kshiftrq $47, %k7, %k1
+; AVX512BW-NEXT:    kmovq %k7, %k4
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k0
+; AVX512BW-NEXT:    kshiftrw $6, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $5, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kandw %k2, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $4, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $3, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $2, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k6, %k6
+; AVX512BW-NEXT:    kshiftlw $14, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k6, %k1
+; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
+; AVX512BW-NEXT:    korw %k0, %k1, %k1
+; AVX512BW-NEXT:    vmovdqa32 1280(%rsi), %zmm20 {%k1} {z}
+; AVX512BW-NEXT:    kshiftrq $48, %k4, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $13, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $12, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $11, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $10, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $9, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k1
+; AVX512BW-NEXT:    kshiftrq $49, %k4, %k0
+; AVX512BW-NEXT:    kmovq %k4, %k7
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $8, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $7, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $6, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $5, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $4, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $3, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $2, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    kandw %k3, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $50, %k7, %k1
+; AVX512BW-NEXT:    kmovq %k7, %k3
+; AVX512BW-NEXT:    kshiftlw $14, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k7
+; AVX512BW-NEXT:    vmovdqa32 1344(%rsi), %zmm21 {%k7} {z}
+; AVX512BW-NEXT:    kandw %k2, %k1, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $13, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $12, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kandw %k5, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $11, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k0, %k1
+; AVX512BW-NEXT:    kmovq %k3, %k7
+; AVX512BW-NEXT:    kshiftrq $51, %k3, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $10, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $9, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $8, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $7, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $6, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $5, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $4, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    kandw %k4, %k0, %k1
+; AVX512BW-NEXT:    kshiftrq $52, %k7, %k6
+; AVX512BW-NEXT:    kshiftlw $15, %k6, %k0
+; AVX512BW-NEXT:    kshiftrw $3, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $2, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $14, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
+; AVX512BW-NEXT:    korw %k0, %k1, %k1
+; AVX512BW-NEXT:    vmovdqa32 1408(%rsi), %zmm22 {%k1} {z}
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k6, %k1
+; AVX512BW-NEXT:    kshiftrw $14, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $13, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k0, %k1
+; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
+; AVX512BW-NEXT:    kshiftrq $53, %k7, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $12, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $11, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $10, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $9, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $8, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $7, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $6, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    kandw %k3, %k0, %k6
+; AVX512BW-NEXT:    kshiftrq $54, %k7, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k1
+; AVX512BW-NEXT:    kshiftrw $5, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $4, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $3, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kandw %k4, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $2, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k6, %k6
+; AVX512BW-NEXT:    kshiftlw $14, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kshiftlw $1, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $1, %k6, %k6
+; AVX512BW-NEXT:    korw %k1, %k6, %k1
+; AVX512BW-NEXT:    vmovdqa32 1472(%rsi), %zmm23 {%k1} {z}
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k1
+; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
+; AVX512BW-NEXT:    kshiftrq $55, %k7, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $13, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $12, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $11, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $10, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $9, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $8, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k1
+; AVX512BW-NEXT:    kshiftrq $56, %k7, %k0
+; AVX512BW-NEXT:    kmovq %k7, %k2
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k6
+; AVX512BW-NEXT:    kshiftrw $7, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $6, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $5, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $4, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $3, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $2, %k6, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $14, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrq $57, %k2, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k1, %k6
+; AVX512BW-NEXT:    korw %k6, %k0, %k7
+; AVX512BW-NEXT:    vmovdqa32 1536(%rsi), %zmm24 {%k7} {z}
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k0, %k1, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $13, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $12, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $11, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $10, %k6, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kandw %k4, %k0, %k1
+; AVX512BW-NEXT:    kmovq %k2, %k7
+; AVX512BW-NEXT:    kshiftrq $58, %k2, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $9, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $8, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $7, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $6, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $5, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $4, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $3, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k1
+; AVX512BW-NEXT:    kshiftrq $59, %k7, %k6
+; AVX512BW-NEXT:    kshiftlw $15, %k6, %k0
+; AVX512BW-NEXT:    kshiftrw $2, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $14, %k6, %k7
+; AVX512BW-NEXT:    korw %k7, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
+; AVX512BW-NEXT:    korw %k0, %k1, %k1
+; AVX512BW-NEXT:    vmovdqa32 1600(%rsi), %zmm25 {%k1} {z}
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k6, %k1
+; AVX512BW-NEXT:    kshiftrw $14, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $13, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $12, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k1
+; AVX512BW-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
+; AVX512BW-NEXT:    kshiftrq $60, %k7, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $11, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $10, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $9, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $8, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $7, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $6, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $5, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    kandw %k3, %k0, %k6
+; AVX512BW-NEXT:    kshiftrq $61, %k7, %k0
+; AVX512BW-NEXT:    kmovq %k7, %k2
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k1
+; AVX512BW-NEXT:    kshiftrw $4, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kandw %k4, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $3, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $2, %k1, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k7, %k6, %k6
+; AVX512BW-NEXT:    kshiftlw $14, %k0, %k7
+; AVX512BW-NEXT:    korw %k7, %k6, %k6
+; AVX512BW-NEXT:    kshiftlw $1, %k6, %k6
+; AVX512BW-NEXT:    kshiftrw $1, %k6, %k6
+; AVX512BW-NEXT:    korw %k1, %k6, %k6
+; AVX512BW-NEXT:    vmovdqa32 1664(%rsi), %zmm26 {%k6} {z}
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $14, %k1, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k1
+; AVX512BW-NEXT:    kshiftrq $62, %k2, %k0
+; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $13, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $12, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $11, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $10, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $9, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $8, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k6, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $7, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    kshiftrq $63, %k2, %k2
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k1, %k0, %k1
+; AVX512BW-NEXT:    kshiftlw $15, %k2, %k0
+; AVX512BW-NEXT:    kshiftrw $6, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $5, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k5, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $4, %k0, %k6
+; AVX512BW-NEXT:    korw %k6, %k1, %k1
+; AVX512BW-NEXT:    kandw %k4, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $3, %k0, %k5
+; AVX512BW-NEXT:    korw %k5, %k1, %k1
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $2, %k0, %k4
+; AVX512BW-NEXT:    korw %k4, %k1, %k1
+; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT:    kandw %k3, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $14, %k2, %k2
+; AVX512BW-NEXT:    korw %k2, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
+; AVX512BW-NEXT:    korw %k0, %k1, %k1
+; AVX512BW-NEXT:    vmovdqa32 1728(%rsi), %zmm27 {%k1} {z}
+; AVX512BW-NEXT:    vmovdqa64 %zmm27, 1728(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm26, 1664(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm25, 1600(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm24, 1536(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm23, 1472(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm22, 1408(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm21, 1344(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm20, 1280(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm19, 1216(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm18, 1152(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, 1088(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, 1024(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, 960(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, 896(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, 832(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm12, 768(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, 704(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm10, 640(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, 576(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, 512(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, 448(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, 384(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, 320(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, 256(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, 192(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, 128(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, 64(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdx)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
-  %src.vec = load <64 x i1>, ptr %in.vec, align 64
-  %tgt.mask = shufflevector <64 x i1> %src.vec, <64 x i1> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 
35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-  store <448 x i1> %tgt.mask, ptr %out.vec, align 64
+  %src.mask = load <64 x i1>, ptr %in.maskvec, align 64
+  %tgt.mask = shufflevector <64 x i1> %src.mask, <64 x i1> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 
35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+  %data = call <448 x i32> @llvm.masked.load.v448i32.p0(ptr %in.vec, i32 64, <448 x i1> %tgt.mask, <448 x i32> poison)
+  store <448 x i32> %data, ptr %out.vec, align 64
   ret void
 }
 
-define void @mask_replication_factor8_vf2(ptr %in.vec, ptr %out.vec) nounwind {
+define void @mask_replication_factor8_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512F-ONLY-LABEL: mask_replication_factor8_vf2:
 ; AVX512F-ONLY:       # %bb.0:
 ; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
 ; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm0
-; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, (%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, (%rdx)
 ; AVX512F-ONLY-NEXT:    vzeroupper
 ; AVX512F-ONLY-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: mask_replication_factor8_vf2:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    kmovb (%rdi), %k0
+; AVX512DQ-NEXT:    kmovw (%rdi), %k0
 ; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
 ; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
-; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k0
-; AVX512DQ-NEXT:    kmovw %k0, (%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k1
+; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
+; AVX512DQ-NEXT:    vmovdqa64 %zmm0, (%rdx)
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512BW-LABEL: mask_replication_factor8_vf2:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    kmovw (%rdi), %k1
+; AVX512BW-NEXT:    kmovq (%rdi), %k1
 ; AVX512BW-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
 ; AVX512BW-NEXT:    vpermd %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; AVX512BW-NEXT:    kmovw %k0, (%rsi)
+; AVX512BW-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdx)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
-  %src.vec = load <2 x i1>, ptr %in.vec, align 64
-  %tgt.mask = shufflevector <2 x i1> %src.vec, <2 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-  store <16 x i1> %tgt.mask, ptr %out.vec, align 64
+  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
+  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <2 x i32> <i32 0, i32 1>
+  %tgt.mask = shufflevector <2 x i1> %src.mask, <2 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %data = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr %in.vec, i32 64, <16 x i1> %tgt.mask, <16 x i32> poison)
+  store <16 x i32> %data, ptr %out.vec, align 64
   ret void
 }
 
-define void @mask_replication_factor8_vf4(ptr %in.vec, ptr %out.vec) nounwind {
+define void @mask_replication_factor8_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512F-ONLY-LABEL: mask_replication_factor8_vf4:
 ; AVX512F-ONLY:       # %bb.0:
 ; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
 ; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
-; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
-; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k0
 ; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm0
-; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; AVX512F-ONLY-NEXT:    kmovw %k1, 2(%rsi)
-; AVX512F-ONLY-NEXT:    kmovw %k0, (%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k2
+; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k2} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm1, 64(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, (%rdx)
 ; AVX512F-ONLY-NEXT:    vzeroupper
 ; AVX512F-ONLY-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: mask_replication_factor8_vf4:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    kmovb (%rdi), %k0
+; AVX512DQ-NEXT:    kmovw (%rdi), %k0
 ; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
-; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
-; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k0
 ; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k1
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
-; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k1
-; AVX512DQ-NEXT:    kmovw %k1, 2(%rsi)
-; AVX512DQ-NEXT:    kmovw %k0, (%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k2
+; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k2} {z}
+; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
+; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 64(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm0, (%rdx)
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512BW-LABEL: mask_replication_factor8_vf4:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    kmovw (%rdi), %k0
+; AVX512BW-NEXT:    kmovq (%rdi), %k0
 ; AVX512BW-NEXT:    vpmovm2w %k0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
 ; AVX512BW-NEXT:    vpermw %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT:    vpmovw2m %zmm0, %k0
-; AVX512BW-NEXT:    kmovd %k0, (%rsi)
+; AVX512BW-NEXT:    vpmovw2m %zmm0, %k1
+; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    kshiftrd $16, %k1, %k1
+; AVX512BW-NEXT:    vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, 64(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdx)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
-  %src.vec = load <4 x i1>, ptr %in.vec, align 64
-  %tgt.mask = shufflevector <4 x i1> %src.vec, <4 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-  store <32 x i1> %tgt.mask, ptr %out.vec, align 64
+  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
+  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %tgt.mask = shufflevector <4 x i1> %src.mask, <4 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  %data = call <32 x i32> @llvm.masked.load.v32i32.p0(ptr %in.vec, i32 64, <32 x i1> %tgt.mask, <32 x i32> poison)
+  store <32 x i32> %data, ptr %out.vec, align 64
   ret void
 }
 
-define void @mask_replication_factor8_vf8(ptr %in.vec, ptr %out.vec) nounwind {
+define void @mask_replication_factor8_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512F-ONLY-LABEL: mask_replication_factor8_vf8:
 ; AVX512F-ONLY:       # %bb.0:
 ; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
 ; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
-; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
-; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5]
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
 ; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5]
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
 ; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k2
-; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k3
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm0
-; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k3
-; AVX512F-ONLY-NEXT:    kmovw %k3, 6(%rsi)
-; AVX512F-ONLY-NEXT:    kmovw %k2, 4(%rsi)
-; AVX512F-ONLY-NEXT:    kmovw %k1, 2(%rsi)
-; AVX512F-ONLY-NEXT:    kmovw %k0, (%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k4
+; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k4} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k3} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm1, (%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
 ; AVX512F-ONLY-NEXT:    vzeroupper
 ; AVX512F-ONLY-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: mask_replication_factor8_vf8:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    kmovb (%rdi), %k0
+; AVX512DQ-NEXT:    kmovw (%rdi), %k0
 ; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
-; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
-; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k0
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5]
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
 ; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k1
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5]
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
 ; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k2
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k3
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
-; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k3
-; AVX512DQ-NEXT:    kmovw %k3, 6(%rsi)
-; AVX512DQ-NEXT:    kmovw %k2, 4(%rsi)
-; AVX512DQ-NEXT:    kmovw %k1, 2(%rsi)
-; AVX512DQ-NEXT:    kmovw %k0, (%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k4
+; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k4} {z}
+; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k3} {z}
+; AVX512DQ-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
+; AVX512DQ-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
+; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm1, (%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512BW-ONLY-LABEL: mask_replication_factor8_vf8:
 ; AVX512BW-ONLY:       # %bb.0:
-; AVX512BW-ONLY-NEXT:    kmovw (%rdi), %k0
+; AVX512BW-ONLY-NEXT:    kmovq (%rdi), %k0
 ; AVX512BW-ONLY-NEXT:    vpmovm2b %k0, %zmm0
 ; AVX512BW-ONLY-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
 ; AVX512BW-ONLY-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19,36,36,36,36,36,36,36,36,37,37,37,37,37,37,37,37,54,54,54,54,54,54,54,54,55,55,55,55,55,55,55,55]
-; AVX512BW-ONLY-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512BW-ONLY-NEXT:    kmovq %k0, (%rsi)
+; AVX512BW-ONLY-NEXT:    vpmovb2m %zmm0, %k1
+; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
+; AVX512BW-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k1} {z}
+; AVX512BW-ONLY-NEXT:    kshiftrq $32, %k1, %k1
+; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
+; AVX512BW-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm1, (%rdx)
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
 ; AVX512BW-ONLY-NEXT:    vzeroupper
 ; AVX512BW-ONLY-NEXT:    retq
 ;
 ; AVX512VBMI-ONLY-LABEL: mask_replication_factor8_vf8:
 ; AVX512VBMI-ONLY:       # %bb.0:
-; AVX512VBMI-ONLY-NEXT:    kmovw (%rdi), %k0
+; AVX512VBMI-ONLY-NEXT:    kmovq (%rdi), %k0
 ; AVX512VBMI-ONLY-NEXT:    vpmovm2b %k0, %zmm0
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
 ; AVX512VBMI-ONLY-NEXT:    vpermb %zmm0, %zmm1, %zmm0
-; AVX512VBMI-ONLY-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512VBMI-ONLY-NEXT:    kmovq %k0, (%rsi)
+; AVX512VBMI-ONLY-NEXT:    vpmovb2m %zmm0, %k1
+; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k1} {z}
+; AVX512VBMI-ONLY-NEXT:    kshiftrq $32, %k1, %k1
+; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm1, (%rdx)
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
 ; AVX512VBMI-ONLY-NEXT:    vzeroupper
 ; AVX512VBMI-ONLY-NEXT:    retq
-  %src.vec = load <8 x i1>, ptr %in.vec, align 64
-  %tgt.mask = shufflevector <8 x i1> %src.vec, <8 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-  store <64 x i1> %tgt.mask, ptr %out.vec, align 64
+  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
+  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %tgt.mask = shufflevector <8 x i1> %src.mask, <8 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+  %data = call <64 x i32> @llvm.masked.load.v64i32.p0(ptr %in.vec, i32 64, <64 x i1> %tgt.mask, <64 x i32> poison)
+  store <64 x i32> %data, ptr %out.vec, align 64
   ret void
 }
 
-define void @mask_replication_factor8_vf16(ptr %in.vec, ptr %out.vec) nounwind {
+define void @mask_replication_factor8_vf16(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512F-ONLY-LABEL: mask_replication_factor8_vf16:
 ; AVX512F-ONLY:       # %bb.0:
 ; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
 ; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
-; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
-; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13]
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
 ; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
+; AVX512F-ONLY-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15]
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
 ; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k2
-; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5]
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9]
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
 ; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k3
 ; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11]
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
 ; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k4
-; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9]
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5]
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
 ; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k5
-; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15]
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
 ; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k6
-; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13]
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k7
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm0
-; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k7
-; AVX512F-ONLY-NEXT:    kmovw %k7, 12(%rsi)
-; AVX512F-ONLY-NEXT:    kmovw %k6, 14(%rsi)
-; AVX512F-ONLY-NEXT:    kmovw %k5, 8(%rsi)
-; AVX512F-ONLY-NEXT:    kmovw %k4, 10(%rsi)
-; AVX512F-ONLY-NEXT:    kmovw %k3, 4(%rsi)
-; AVX512F-ONLY-NEXT:    kmovw %k2, 6(%rsi)
-; AVX512F-ONLY-NEXT:    kmovw %k1, 2(%rsi)
-; AVX512F-ONLY-NEXT:    kmovw %k0, (%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k7} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k6} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k5} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 320(%rsi), %zmm4 {%k4} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 256(%rsi), %zmm5 {%k3} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
+; AVX512F-ONLY-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512F-ONLY-NEXT:    vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm7, 384(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm6, 448(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm5, 256(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm4, 320(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm1, (%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
 ; AVX512F-ONLY-NEXT:    vzeroupper
 ; AVX512F-ONLY-NEXT:    retq
 ;
@@ -28715,38 +13178,48 @@ define void @mask_replication_factor8_vf16(ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    kmovw (%rdi), %k0
 ; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
-; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
-; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k0
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13]
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
 ; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k1
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
+; AVX512DQ-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15]
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
 ; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k2
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5]
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9]
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
 ; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k3
 ; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11]
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
 ; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k4
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9]
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5]
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
 ; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k5
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15]
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
 ; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k6
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13]
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k7
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
-; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k7
-; AVX512DQ-NEXT:    kmovw %k7, 12(%rsi)
-; AVX512DQ-NEXT:    kmovw %k6, 14(%rsi)
-; AVX512DQ-NEXT:    kmovw %k5, 8(%rsi)
-; AVX512DQ-NEXT:    kmovw %k4, 10(%rsi)
-; AVX512DQ-NEXT:    kmovw %k3, 4(%rsi)
-; AVX512DQ-NEXT:    kmovw %k2, 6(%rsi)
-; AVX512DQ-NEXT:    kmovw %k1, 2(%rsi)
-; AVX512DQ-NEXT:    kmovw %k0, (%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k1
+; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
+; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k7} {z}
+; AVX512DQ-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k6} {z}
+; AVX512DQ-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k5} {z}
+; AVX512DQ-NEXT:    vmovdqa32 320(%rsi), %zmm4 {%k4} {z}
+; AVX512DQ-NEXT:    vmovdqa32 256(%rsi), %zmm5 {%k3} {z}
+; AVX512DQ-NEXT:    vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
+; AVX512DQ-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512DQ-NEXT:    vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
+; AVX512DQ-NEXT:    vmovdqa64 %zmm7, 384(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm6, 448(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm5, 256(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 320(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm1, (%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
@@ -28755,41 +13228,63 @@ define void @mask_replication_factor8_vf16(ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512BW-NEXT:    kmovw (%rdi), %k0
 ; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
 ; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm1 = zmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19,36,36,36,36,36,36,36,36,37,37,37,37,37,37,37,37,54,54,54,54,54,54,54,54,55,55,55,55,55,55,55,55]
-; AVX512BW-NEXT:    vpmovb2m %zmm1, %k0
-; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,26,26,26,26,26,26,26,26,27,27,27,27,27,27,27,27,44,44,44,44,44,44,44,44,45,45,45,45,45,45,45,45,62,62,62,62,62,62,62,62,63,63,63,63,63,63,63,63]
-; AVX512BW-NEXT:    vpmovb2m %zmm0, %k1
-; AVX512BW-NEXT:    kmovq %k1, 8(%rsi)
-; AVX512BW-NEXT:    kmovq %k0, (%rsi)
+; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm1 = zmm0[8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,26,26,26,26,26,26,26,26,27,27,27,27,27,27,27,27,44,44,44,44,44,44,44,44,45,45,45,45,45,45,45,45,62,62,62,62,62,62,62,62,63,63,63,63,63,63,63,63]
+; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19,36,36,36,36,36,36,36,36,37,37,37,37,37,37,37,37,54,54,54,54,54,54,54,54,55,55,55,55,55,55,55,55]
+; AVX512BW-NEXT:    vpmovb2m %zmm0, %k2
+; AVX512BW-NEXT:    kshiftrd $16, %k2, %k3
+; AVX512BW-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k3} {z}
+; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k2} {z}
+; AVX512BW-NEXT:    kshiftrq $32, %k2, %k2
+; AVX512BW-NEXT:    kshiftrd $16, %k2, %k3
+; AVX512BW-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k3} {z}
+; AVX512BW-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k2} {z}
+; AVX512BW-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT:    vmovdqa32 320(%rsi), %zmm4 {%k2} {z}
+; AVX512BW-NEXT:    vmovdqa32 256(%rsi), %zmm5 {%k1} {z}
+; AVX512BW-NEXT:    kshiftrq $32, %k1, %k1
+; AVX512BW-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT:    vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
+; AVX512BW-NEXT:    vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, 384(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, 448(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, 256(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, 320(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, (%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
-  %src.vec = load <16 x i1>, ptr %in.vec, align 64
-  %tgt.mask = shufflevector <16 x i1> %src.vec, <16 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-  store <128 x i1> %tgt.mask, ptr %out.vec, align 64
+  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
+  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %tgt.mask = shufflevector <16 x i1> %src.mask, <16 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+  %data = call <128 x i32> @llvm.masked.load.v128i32.p0(ptr %in.vec, i32 64, <128 x i1> %tgt.mask, <128 x i32> poison)
+  store <128 x i32> %data, ptr %out.vec, align 64
   ret void
 }
 
-define void @mask_replication_factor8_vf32(ptr %in.vec, ptr %out.vec) nounwind {
+define void @mask_replication_factor8_vf32(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512F-ONLY-LABEL: mask_replication_factor8_vf32:
 ; AVX512F-ONLY:       # %bb.0:
-; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
-; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512F-ONLY-NEXT:    kmovw 2(%rdi), %k1
-; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15]
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm2
-; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13]
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm3, %zmm4
-; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11]
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm5, %zmm6
-; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5]
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9]
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm7, %zmm8
-; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11]
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm9, %zmm10
-; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9]
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5]
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm11, %zmm12
-; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm13 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15]
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm13 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm13, %zmm14
-; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm15 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13]
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm15 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm15, %zmm0
 ; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm16, %zmm16, %zmm16 {%k1} {z}
 ; AVX512F-ONLY-NEXT:    vpermd %zmm16, %zmm1, %zmm1
@@ -28800,61 +13295,77 @@ define void @mask_replication_factor8_vf32(ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512F-ONLY-NEXT:    vpermd %zmm16, %zmm11, %zmm11
 ; AVX512F-ONLY-NEXT:    vpermd %zmm16, %zmm13, %zmm13
 ; AVX512F-ONLY-NEXT:    vpermd %zmm16, %zmm15, %zmm15
-; AVX512F-ONLY-NEXT:    vptestmd %zmm15, %zmm15, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 28(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm13, %zmm13, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 30(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm11, %zmm11, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 24(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm9, %zmm9, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 26(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm7, %zmm7, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 20(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm5, %zmm5, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 22(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm3, %zmm3, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 16(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 18(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 12(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm14, %zmm14, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 14(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm12, %zmm12, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 8(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm10, %zmm10, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 10(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm8, %zmm8, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 4(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm6, %zmm6, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 6(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm4, %zmm4, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, (%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 2(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm15, %zmm15, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm15 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm13, %zmm13, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm13 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm11, %zmm11, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm11 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm9, %zmm9, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm9 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm7, %zmm7, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 256(%rsi), %zmm7 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm5, %zmm5, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 320(%rsi), %zmm5 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm3, %zmm3, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 384(%rsi), %zmm3 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 448(%rsi), %zmm1 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 512(%rsi), %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm14, %zmm14, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 576(%rsi), %zmm14 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm12, %zmm12, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 640(%rsi), %zmm12 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm10, %zmm10, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 704(%rsi), %zmm10 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm8, %zmm8, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 768(%rsi), %zmm8 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm6, %zmm6, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 832(%rsi), %zmm6 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm4, %zmm4, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 896(%rsi), %zmm4 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 960(%rsi), %zmm2 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm2, 960(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm4, 896(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm6, 832(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm8, 768(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm10, 704(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm12, 640(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm14, 576(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, 512(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm1, 448(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm3, 384(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm5, 320(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm7, 256(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm9, 192(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm11, 128(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm13, 64(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm15, (%rdx)
 ; AVX512F-ONLY-NEXT:    vzeroupper
 ; AVX512F-ONLY-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: mask_replication_factor8_vf32:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    kmovw (%rdi), %k0
-; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
 ; AVX512DQ-NEXT:    kmovw 2(%rdi), %k0
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT:    kmovw (%rdi), %k0
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15]
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm2
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13]
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm3, %zmm4
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11]
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm5, %zmm6
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5]
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9]
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm7, %zmm8
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11]
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm9, %zmm10
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9]
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5]
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm11, %zmm12
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm13 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15]
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm13 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm13, %zmm14
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm15 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13]
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm15 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm15, %zmm0
 ; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm16
 ; AVX512DQ-NEXT:    vpermd %zmm16, %zmm1, %zmm1
@@ -28865,38 +13376,54 @@ define void @mask_replication_factor8_vf32(ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512DQ-NEXT:    vpermd %zmm16, %zmm11, %zmm11
 ; AVX512DQ-NEXT:    vpermd %zmm16, %zmm13, %zmm13
 ; AVX512DQ-NEXT:    vpermd %zmm16, %zmm15, %zmm15
-; AVX512DQ-NEXT:    vpmovd2m %zmm15, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 28(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm13, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 30(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm11, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 24(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm9, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 26(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm7, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 20(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm5, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 22(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm3, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 16(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 18(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 12(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm14, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 14(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm12, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 8(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm10, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 10(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm8, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 4(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm6, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 6(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm4, %k0
-; AVX512DQ-NEXT:    kmovw %k0, (%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 2(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm15, %k1
+; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm15 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm13, %k1
+; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm13 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm11, %k1
+; AVX512DQ-NEXT:    vmovdqa32 128(%rsi), %zmm11 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm9, %k1
+; AVX512DQ-NEXT:    vmovdqa32 192(%rsi), %zmm9 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm7, %k1
+; AVX512DQ-NEXT:    vmovdqa32 256(%rsi), %zmm7 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm5, %k1
+; AVX512DQ-NEXT:    vmovdqa32 320(%rsi), %zmm5 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm3, %k1
+; AVX512DQ-NEXT:    vmovdqa32 384(%rsi), %zmm3 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k1
+; AVX512DQ-NEXT:    vmovdqa32 448(%rsi), %zmm1 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k1
+; AVX512DQ-NEXT:    vmovdqa32 512(%rsi), %zmm0 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm14, %k1
+; AVX512DQ-NEXT:    vmovdqa32 576(%rsi), %zmm14 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm12, %k1
+; AVX512DQ-NEXT:    vmovdqa32 640(%rsi), %zmm12 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm10, %k1
+; AVX512DQ-NEXT:    vmovdqa32 704(%rsi), %zmm10 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm8, %k1
+; AVX512DQ-NEXT:    vmovdqa32 768(%rsi), %zmm8 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm6, %k1
+; AVX512DQ-NEXT:    vmovdqa32 832(%rsi), %zmm6 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm4, %k1
+; AVX512DQ-NEXT:    vmovdqa32 896(%rsi), %zmm4 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k1
+; AVX512DQ-NEXT:    vmovdqa32 960(%rsi), %zmm2 {%k1} {z}
+; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 960(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 896(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm6, 832(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm8, 768(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm10, 704(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm12, 640(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm14, 576(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 512(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 448(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 384(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm5, 320(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm7, 256(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm9, 192(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm11, 128(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm13, 64(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm15, (%rdx)
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
@@ -28904,56 +13431,98 @@ define void @mask_replication_factor8_vf32(ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    kmovd (%rdi), %k0
 ; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,0,1,0,1,0,1]
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,2,3,2,3]
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11,12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13,14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm1, %zmm3
-; AVX512BW-NEXT:    vpmovb2m %zmm3, %k0
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11,12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13,14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpmovb2m %zmm3, %k1
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
 ; AVX512BW-NEXT:    vpshufb %zmm3, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3,2,3,2,3]
-; AVX512BW-NEXT:    vpshufb %zmm2, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vpmovb2m %zmm1, %k2
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
+; AVX512BW-NEXT:    vpshufb %zmm2, %zmm0, %zmm1
+; AVX512BW-NEXT:    vpmovb2m %zmm1, %k3
 ; AVX512BW-NEXT:    vpshufb %zmm3, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpmovb2m %zmm0, %k3
-; AVX512BW-NEXT:    kmovq %k3, 24(%rsi)
-; AVX512BW-NEXT:    kmovq %k2, 16(%rsi)
-; AVX512BW-NEXT:    kmovq %k1, 8(%rsi)
-; AVX512BW-NEXT:    kmovq %k0, (%rsi)
+; AVX512BW-NEXT:    vpmovb2m %zmm0, %k4
+; AVX512BW-NEXT:    kshiftrd $16, %k4, %k5
+; AVX512BW-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k5} {z}
+; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k4} {z}
+; AVX512BW-NEXT:    kshiftrq $32, %k4, %k4
+; AVX512BW-NEXT:    kshiftrd $16, %k4, %k5
+; AVX512BW-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k5} {z}
+; AVX512BW-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k4} {z}
+; AVX512BW-NEXT:    kshiftrd $16, %k3, %k4
+; AVX512BW-NEXT:    vmovdqa32 320(%rsi), %zmm4 {%k4} {z}
+; AVX512BW-NEXT:    vmovdqa32 256(%rsi), %zmm5 {%k3} {z}
+; AVX512BW-NEXT:    kshiftrq $32, %k3, %k3
+; AVX512BW-NEXT:    kshiftrd $16, %k3, %k4
+; AVX512BW-NEXT:    vmovdqa32 448(%rsi), %zmm6 {%k4} {z}
+; AVX512BW-NEXT:    vmovdqa32 384(%rsi), %zmm7 {%k3} {z}
+; AVX512BW-NEXT:    kshiftrd $16, %k2, %k3
+; AVX512BW-NEXT:    vmovdqa32 576(%rsi), %zmm8 {%k3} {z}
+; AVX512BW-NEXT:    vmovdqa32 512(%rsi), %zmm9 {%k2} {z}
+; AVX512BW-NEXT:    kshiftrq $32, %k2, %k2
+; AVX512BW-NEXT:    kshiftrd $16, %k2, %k3
+; AVX512BW-NEXT:    vmovdqa32 704(%rsi), %zmm10 {%k3} {z}
+; AVX512BW-NEXT:    vmovdqa32 640(%rsi), %zmm11 {%k2} {z}
+; AVX512BW-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT:    vmovdqa32 832(%rsi), %zmm12 {%k2} {z}
+; AVX512BW-NEXT:    vmovdqa32 768(%rsi), %zmm13 {%k1} {z}
+; AVX512BW-NEXT:    kshiftrq $32, %k1, %k1
+; AVX512BW-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT:    vmovdqa32 960(%rsi), %zmm14 {%k2} {z}
+; AVX512BW-NEXT:    vmovdqa32 896(%rsi), %zmm15 {%k1} {z}
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, 896(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, 960(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, 768(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm12, 832(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, 640(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm10, 704(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, 512(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, 576(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, 384(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, 448(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, 256(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, 320(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, (%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, 64(%rdx)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
-  %src.vec = load <32 x i1>, ptr %in.vec, align 64
-  %tgt.mask = shufflevector <32 x i1> %src.vec, <32 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, 
i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
-  store <256 x i1> %tgt.mask, ptr %out.vec, align 64
+  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
+  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %tgt.mask = shufflevector <32 x i1> %src.mask, <32 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 
31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+  %data = call <256 x i32> @llvm.masked.load.v256i32.p0(ptr %in.vec, i32 64, <256 x i1> %tgt.mask, <256 x i32> poison)
+  store <256 x i32> %data, ptr %out.vec, align 64
   ret void
 }
 
-define void @mask_replication_factor8_vf64(ptr %in.vec, ptr %out.vec) nounwind {
+define void @mask_replication_factor8_vf64(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512F-ONLY-LABEL: mask_replication_factor8_vf64:
 ; AVX512F-ONLY:       # %bb.0:
-; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
+; AVX512F-ONLY-NEXT:    kmovw 6(%rdi), %k1
 ; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-ONLY-NEXT:    kmovw 2(%rdi), %k1
-; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; AVX512F-ONLY-NEXT:    kmovw 4(%rdi), %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; AVX512F-ONLY-NEXT:    kmovw 2(%rdi), %k1
 ; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; AVX512F-ONLY-NEXT:    kmovw 6(%rdi), %k1
-; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15]
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm3, %zmm4
 ; AVX512F-ONLY-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13]
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm5, %zmm6
-; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11]
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm7, %zmm8
-; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5]
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9]
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm9, %zmm10
-; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11]
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm11, %zmm12
-; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm13 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9]
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm13 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5]
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm13, %zmm14
-; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm15 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15]
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm15 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm15, %zmm16
-; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm17 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13]
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm17 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm17, %zmm4
 ; AVX512F-ONLY-NEXT:    vpermd %zmm1, %zmm3, %zmm18
 ; AVX512F-ONLY-NEXT:    vpermd %zmm1, %zmm5, %zmm19
@@ -28980,99 +13549,131 @@ define void @mask_replication_factor8_vf64(ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm13, %zmm13
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm15, %zmm15
 ; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm17, %zmm0
-; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 60(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm15, %zmm15, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 62(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm13, %zmm13, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 56(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm11, %zmm11, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 58(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm9, %zmm9, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 52(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm7, %zmm7, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 54(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm5, %zmm5, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 48(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm3, %zmm3, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 50(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 44(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm31, %zmm31, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 46(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm30, %zmm30, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 40(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm29, %zmm29, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 42(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm28, %zmm28, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 36(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm27, %zmm27, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 38(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm26, %zmm26, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 32(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm25, %zmm25, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 34(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 28(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm24, %zmm24, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 30(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm23, %zmm23, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 24(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm22, %zmm22, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 26(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm21, %zmm21, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 20(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm20, %zmm20, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 22(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm19, %zmm19, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 16(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm18, %zmm18, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 18(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm4, %zmm4, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 12(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm16, %zmm16, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 14(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm14, %zmm14, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 8(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm12, %zmm12, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 10(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm10, %zmm10, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 4(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm8, %zmm8, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 6(%rsi)
-; AVX512F-ONLY-NEXT:    vptestmd %zmm6, %zmm6, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, (%rsi)
-; AVX512F-ONLY-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; AVX512F-ONLY-NEXT:    kmovw %k0, 2(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm15, %zmm15, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm15 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm13, %zmm13, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm13 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm11, %zmm11, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm11 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm9, %zmm9, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 256(%rsi), %zmm9 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm7, %zmm7, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 320(%rsi), %zmm7 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm5, %zmm5, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 384(%rsi), %zmm5 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm3, %zmm3, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 448(%rsi), %zmm3 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 512(%rsi), %zmm2 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm31, %zmm31, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 576(%rsi), %zmm17 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm30, %zmm30, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 640(%rsi), %zmm30 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm29, %zmm29, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 704(%rsi), %zmm29 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm28, %zmm28, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 768(%rsi), %zmm28 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm27, %zmm27, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 832(%rsi), %zmm27 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm26, %zmm26, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 896(%rsi), %zmm26 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm25, %zmm25, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 960(%rsi), %zmm25 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 1024(%rsi), %zmm1 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm24, %zmm24, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 1088(%rsi), %zmm24 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm23, %zmm23, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 1152(%rsi), %zmm23 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm22, %zmm22, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 1216(%rsi), %zmm22 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm21, %zmm21, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 1280(%rsi), %zmm21 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm20, %zmm20, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 1344(%rsi), %zmm20 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm19, %zmm19, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 1408(%rsi), %zmm19 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm18, %zmm18, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 1472(%rsi), %zmm18 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm4, %zmm4, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 1536(%rsi), %zmm4 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm16, %zmm16, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 1600(%rsi), %zmm16 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm14, %zmm14, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 1664(%rsi), %zmm14 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm12, %zmm12, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 1728(%rsi), %zmm12 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm10, %zmm10, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 1792(%rsi), %zmm10 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm8, %zmm8, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 1856(%rsi), %zmm8 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vptestmd %zmm6, %zmm6, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 1920(%rsi), %zmm6 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512F-ONLY-NEXT:    vptestmd %zmm31, %zmm31, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa32 1984(%rsi), %zmm31 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm31, 1984(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm6, 1920(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm8, 1856(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm10, 1792(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm12, 1728(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm14, 1664(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm16, 1600(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm4, 1536(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm18, 1472(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm19, 1408(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm20, 1344(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm21, 1280(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm22, 1216(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm23, 1152(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm24, 1088(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm1, 1024(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm25, 960(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm26, 896(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm27, 832(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm28, 768(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm29, 704(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm30, 640(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm17, 576(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm2, 512(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm3, 448(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm5, 384(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm7, 320(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm9, 256(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm11, 192(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm13, 128(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm15, 64(%rdx)
+; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm0, (%rdx)
 ; AVX512F-ONLY-NEXT:    vzeroupper
 ; AVX512F-ONLY-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: mask_replication_factor8_vf64:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    kmovw (%rdi), %k0
+; AVX512DQ-NEXT:    kmovw 6(%rdi), %k0
 ; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
-; AVX512DQ-NEXT:    kmovw 2(%rdi), %k0
-; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm1
 ; AVX512DQ-NEXT:    kmovw 4(%rdi), %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm1
+; AVX512DQ-NEXT:    kmovw 2(%rdi), %k0
 ; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm2
-; AVX512DQ-NEXT:    kmovw 6(%rdi), %k0
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
+; AVX512DQ-NEXT:    kmovw (%rdi), %k0
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15]
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm3, %zmm4
 ; AVX512DQ-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13]
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm5, %zmm6
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11]
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm7, %zmm8
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5]
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9]
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm9, %zmm10
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11]
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm11, %zmm12
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm13 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9]
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm13 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5]
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm13, %zmm14
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm15 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15]
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm15 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm15, %zmm16
-; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm17 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13]
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm17 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm17, %zmm4
 ; AVX512DQ-NEXT:    vpermd %zmm1, %zmm3, %zmm18
 ; AVX512DQ-NEXT:    vpermd %zmm1, %zmm5, %zmm19
@@ -29099,71 +13700,103 @@ define void @mask_replication_factor8_vf64(ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm13, %zmm13
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm15, %zmm15
 ; AVX512DQ-NEXT:    vpermd %zmm0, %zmm17, %zmm0
-; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 60(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm15, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 62(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm13, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 56(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm11, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 58(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm9, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 52(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm7, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 54(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm5, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 48(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm3, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 50(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 44(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm31, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 46(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm30, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 40(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm29, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 42(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm28, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 36(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm27, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 38(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm26, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 32(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm25, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 34(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 28(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm24, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 30(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm23, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 24(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm22, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 26(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm21, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 20(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm20, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 22(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm19, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 16(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm18, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 18(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm4, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 12(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm16, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 14(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm14, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 8(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm12, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 10(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm10, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 4(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm8, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 6(%rsi)
-; AVX512DQ-NEXT:    vpmovd2m %zmm6, %k0
-; AVX512DQ-NEXT:    kmovw %k0, (%rsi)
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 2(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k1
+; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm15, %k1
+; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm15 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm13, %k1
+; AVX512DQ-NEXT:    vmovdqa32 128(%rsi), %zmm13 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm11, %k1
+; AVX512DQ-NEXT:    vmovdqa32 192(%rsi), %zmm11 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm9, %k1
+; AVX512DQ-NEXT:    vmovdqa32 256(%rsi), %zmm9 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm7, %k1
+; AVX512DQ-NEXT:    vmovdqa32 320(%rsi), %zmm7 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm5, %k1
+; AVX512DQ-NEXT:    vmovdqa32 384(%rsi), %zmm5 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm3, %k1
+; AVX512DQ-NEXT:    vmovdqa32 448(%rsi), %zmm3 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k1
+; AVX512DQ-NEXT:    vmovdqa32 512(%rsi), %zmm2 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm31, %k1
+; AVX512DQ-NEXT:    vmovdqa32 576(%rsi), %zmm17 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm30, %k1
+; AVX512DQ-NEXT:    vmovdqa32 640(%rsi), %zmm30 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm29, %k1
+; AVX512DQ-NEXT:    vmovdqa32 704(%rsi), %zmm29 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm28, %k1
+; AVX512DQ-NEXT:    vmovdqa32 768(%rsi), %zmm28 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm27, %k1
+; AVX512DQ-NEXT:    vmovdqa32 832(%rsi), %zmm27 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm26, %k1
+; AVX512DQ-NEXT:    vmovdqa32 896(%rsi), %zmm26 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm25, %k1
+; AVX512DQ-NEXT:    vmovdqa32 960(%rsi), %zmm25 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k1
+; AVX512DQ-NEXT:    vmovdqa32 1024(%rsi), %zmm1 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm24, %k1
+; AVX512DQ-NEXT:    vmovdqa32 1088(%rsi), %zmm24 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm23, %k1
+; AVX512DQ-NEXT:    vmovdqa32 1152(%rsi), %zmm23 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm22, %k1
+; AVX512DQ-NEXT:    vmovdqa32 1216(%rsi), %zmm22 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm21, %k1
+; AVX512DQ-NEXT:    vmovdqa32 1280(%rsi), %zmm21 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm20, %k1
+; AVX512DQ-NEXT:    vmovdqa32 1344(%rsi), %zmm20 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm19, %k1
+; AVX512DQ-NEXT:    vmovdqa32 1408(%rsi), %zmm19 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm18, %k1
+; AVX512DQ-NEXT:    vmovdqa32 1472(%rsi), %zmm18 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm4, %k1
+; AVX512DQ-NEXT:    vmovdqa32 1536(%rsi), %zmm4 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm16, %k1
+; AVX512DQ-NEXT:    vmovdqa32 1600(%rsi), %zmm16 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm14, %k1
+; AVX512DQ-NEXT:    vmovdqa32 1664(%rsi), %zmm14 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm12, %k1
+; AVX512DQ-NEXT:    vmovdqa32 1728(%rsi), %zmm12 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm10, %k1
+; AVX512DQ-NEXT:    vmovdqa32 1792(%rsi), %zmm10 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm8, %k1
+; AVX512DQ-NEXT:    vmovdqa32 1856(%rsi), %zmm8 {%k1} {z}
+; AVX512DQ-NEXT:    vpmovd2m %zmm6, %k1
+; AVX512DQ-NEXT:    vmovdqa32 1920(%rsi), %zmm6 {%k1} {z}
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-NEXT:    vpmovd2m %zmm31, %k1
+; AVX512DQ-NEXT:    vmovdqa32 1984(%rsi), %zmm31 {%k1} {z}
+; AVX512DQ-NEXT:    vmovdqa64 %zmm31, 1984(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm6, 1920(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm8, 1856(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm10, 1792(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm12, 1728(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm14, 1664(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm16, 1600(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 1536(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm18, 1472(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm19, 1408(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm20, 1344(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm21, 1280(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm22, 1216(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm23, 1152(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm24, 1088(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 1024(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm25, 960(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm26, 896(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm27, 832(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm28, 768(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm29, 704(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm30, 640(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm17, 576(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 512(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 448(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm5, 384(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm7, 320(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm9, 256(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm11, 192(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm13, 128(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm15, 64(%rdx)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm0, (%rdx)
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
@@ -29171,43 +13804,152 @@ define void @mask_replication_factor8_vf64(ptr %in.vec, ptr %out.vec) nounwind {
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    kmovq (%rdi), %k0
 ; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,0,1,0,1,0,1]
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
-; AVX512BW-NEXT:    vpshufb %zmm2, %zmm1, %zmm3
-; AVX512BW-NEXT:    vpmovb2m %zmm3, %k0
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11,12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13,14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT:    vpshufb %zmm3, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,2,3,2,3]
-; AVX512BW-NEXT:    vpshufb %zmm3, %zmm1, %zmm4
-; AVX512BW-NEXT:    vpmovb2m %zmm4, %k2
-; AVX512BW-NEXT:    vpshufb %zmm2, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpmovb2m %zmm1, %k3
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[6,7,6,7,6,7,6,7]
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11,12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13,14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpshufb %zmm2, %zmm1, %zmm7
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT:    vpshufb %zmm3, %zmm1, %zmm12
 ; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,4,5,4,5,4,5]
+; AVX512BW-NEXT:    vpshufb %zmm2, %zmm1, %zmm17
+; AVX512BW-NEXT:    vpshufb %zmm3, %zmm1, %zmm14
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,2,3,2,3]
+; AVX512BW-NEXT:    vpshufb %zmm2, %zmm1, %zmm9
 ; AVX512BW-NEXT:    vpshufb %zmm3, %zmm1, %zmm4
-; AVX512BW-NEXT:    vpmovb2m %zmm4, %k4
-; AVX512BW-NEXT:    vpshufb %zmm2, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpmovb2m %zmm1, %k5
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[6,7,6,7,6,7,6,7]
-; AVX512BW-NEXT:    vpshufb %zmm3, %zmm0, %zmm1
-; AVX512BW-NEXT:    vpmovb2m %zmm1, %k6
-; AVX512BW-NEXT:    vpshufb %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpmovb2m %zmm0, %k7
-; AVX512BW-NEXT:    kmovq %k7, 48(%rsi)
-; AVX512BW-NEXT:    kmovq %k6, 56(%rsi)
-; AVX512BW-NEXT:    kmovq %k5, 32(%rsi)
-; AVX512BW-NEXT:    kmovq %k4, 40(%rsi)
-; AVX512BW-NEXT:    kmovq %k3, 16(%rsi)
-; AVX512BW-NEXT:    kmovq %k2, 24(%rsi)
-; AVX512BW-NEXT:    kmovq %k1, 8(%rsi)
-; AVX512BW-NEXT:    kmovq %k0, (%rsi)
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
+; AVX512BW-NEXT:    vpshufb %zmm2, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpshufb %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovb2m %zmm0, %k1
+; AVX512BW-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT:    vmovdqa32 64(%rsi), %zmm1 {%k2} {z}
+; AVX512BW-NEXT:    vpmovb2m %zmm2, %k2
+; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    kshiftrq $32, %k1, %k1
+; AVX512BW-NEXT:    vmovdqa32 128(%rsi), %zmm2 {%k1} {z}
+; AVX512BW-NEXT:    kshiftrd $16, %k1, %k1
+; AVX512BW-NEXT:    vmovdqa32 192(%rsi), %zmm3 {%k1} {z}
+; AVX512BW-NEXT:    kshiftrd $16, %k2, %k1
+; AVX512BW-NEXT:    vmovdqa32 320(%rsi), %zmm5 {%k1} {z}
+; AVX512BW-NEXT:    vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT:    vmovdqa32 256(%rsi), %zmm4 {%k2} {z}
+; AVX512BW-NEXT:    kshiftrq $32, %k2, %k2
+; AVX512BW-NEXT:    vmovdqa32 384(%rsi), %zmm6 {%k2} {z}
+; AVX512BW-NEXT:    kshiftrd $16, %k2, %k2
+; AVX512BW-NEXT:    vmovdqa32 448(%rsi), %zmm8 {%k2} {z}
+; AVX512BW-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT:    vmovdqa32 576(%rsi), %zmm10 {%k2} {z}
+; AVX512BW-NEXT:    vpmovb2m %zmm9, %k2
+; AVX512BW-NEXT:    vmovdqa32 512(%rsi), %zmm9 {%k1} {z}
+; AVX512BW-NEXT:    kshiftrq $32, %k1, %k1
+; AVX512BW-NEXT:    vmovdqa32 640(%rsi), %zmm11 {%k1} {z}
+; AVX512BW-NEXT:    kshiftrd $16, %k1, %k1
+; AVX512BW-NEXT:    vmovdqa32 704(%rsi), %zmm13 {%k1} {z}
+; AVX512BW-NEXT:    kshiftrd $16, %k2, %k1
+; AVX512BW-NEXT:    vmovdqa32 832(%rsi), %zmm15 {%k1} {z}
+; AVX512BW-NEXT:    vpmovb2m %zmm14, %k1
+; AVX512BW-NEXT:    vmovdqa32 768(%rsi), %zmm14 {%k2} {z}
+; AVX512BW-NEXT:    kshiftrq $32, %k2, %k2
+; AVX512BW-NEXT:    vmovdqa32 896(%rsi), %zmm16 {%k2} {z}
+; AVX512BW-NEXT:    kshiftrd $16, %k2, %k2
+; AVX512BW-NEXT:    vmovdqa32 960(%rsi), %zmm18 {%k2} {z}
+; AVX512BW-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT:    vmovdqa32 1088(%rsi), %zmm19 {%k2} {z}
+; AVX512BW-NEXT:    vpmovb2m %zmm17, %k2
+; AVX512BW-NEXT:    vmovdqa32 1024(%rsi), %zmm17 {%k1} {z}
+; AVX512BW-NEXT:    kshiftrq $32, %k1, %k1
+; AVX512BW-NEXT:    vmovdqa32 1152(%rsi), %zmm20 {%k1} {z}
+; AVX512BW-NEXT:    kshiftrd $16, %k1, %k1
+; AVX512BW-NEXT:    vmovdqa32 1216(%rsi), %zmm21 {%k1} {z}
+; AVX512BW-NEXT:    kshiftrd $16, %k2, %k1
+; AVX512BW-NEXT:    vmovdqa32 1344(%rsi), %zmm22 {%k1} {z}
+; AVX512BW-NEXT:    vpmovb2m %zmm12, %k1
+; AVX512BW-NEXT:    vmovdqa32 1280(%rsi), %zmm12 {%k2} {z}
+; AVX512BW-NEXT:    kshiftrq $32, %k2, %k2
+; AVX512BW-NEXT:    vmovdqa32 1408(%rsi), %zmm23 {%k2} {z}
+; AVX512BW-NEXT:    kshiftrd $16, %k2, %k2
+; AVX512BW-NEXT:    vmovdqa32 1472(%rsi), %zmm24 {%k2} {z}
+; AVX512BW-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT:    vmovdqa32 1600(%rsi), %zmm25 {%k2} {z}
+; AVX512BW-NEXT:    vpmovb2m %zmm7, %k2
+; AVX512BW-NEXT:    vmovdqa32 1536(%rsi), %zmm7 {%k1} {z}
+; AVX512BW-NEXT:    kshiftrq $32, %k1, %k1
+; AVX512BW-NEXT:    vmovdqa32 1664(%rsi), %zmm26 {%k1} {z}
+; AVX512BW-NEXT:    kshiftrd $16, %k1, %k1
+; AVX512BW-NEXT:    vmovdqa32 1728(%rsi), %zmm27 {%k1} {z}
+; AVX512BW-NEXT:    kshiftrd $16, %k2, %k1
+; AVX512BW-NEXT:    vmovdqa32 1856(%rsi), %zmm28 {%k1} {z}
+; AVX512BW-NEXT:    vmovdqa32 1792(%rsi), %zmm29 {%k2} {z}
+; AVX512BW-NEXT:    kshiftrq $32, %k2, %k1
+; AVX512BW-NEXT:    vmovdqa32 1920(%rsi), %zmm30 {%k1} {z}
+; AVX512BW-NEXT:    kshiftrd $16, %k1, %k1
+; AVX512BW-NEXT:    vmovdqa32 1984(%rsi), %zmm31 {%k1} {z}
+; AVX512BW-NEXT:    vmovdqa64 %zmm31, 1984(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm30, 1920(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm28, 1856(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm29, 1792(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm27, 1728(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm26, 1664(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm25, 1600(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, 1536(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm24, 1472(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm23, 1408(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm22, 1344(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm12, 1280(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm21, 1216(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm20, 1152(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm19, 1088(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, 1024(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm18, 960(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, 896(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, 832(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm14, 768(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm13, 704(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, 640(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm10, 576(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, 512(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, 448(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, 384(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, 320(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, 256(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, 192(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, 128(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, 64(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdx)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
-  %src.vec = load <64 x i1>, ptr %in.vec, align 64
-  %tgt.mask = shufflevector <64 x i1> %src.vec, <64 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, 
i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, 
i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
-  store <512 x i1> %tgt.mask, ptr %out.vec, align 64
+  %src.mask = load <64 x i1>, ptr %in.maskvec, align 64
+  %tgt.mask = shufflevector <64 x i1> %src.mask, <64 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 
31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 
62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+  %data = call <512 x i32> @llvm.masked.load.v512i32.p0(ptr %in.vec, i32 64, <512 x i1> %tgt.mask, <512 x i32> poison)
+  store <512 x i32> %data, ptr %out.vec, align 64
   ret void
 }
+
+declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32, <4 x i1>, <4 x i32>)
+declare <6 x i32> @llvm.masked.load.v6i32.p0(ptr, i32, <6 x i1>, <6 x i32>)
+declare <8 x i32> @llvm.masked.load.v8i32.p0(ptr, i32, <8 x i1>, <8 x i32>)
+declare <10 x i32> @llvm.masked.load.v10i32.p0(ptr, i32, <10 x i1>, <10 x i32>)
+declare <12 x i32> @llvm.masked.load.v12i32.p0(ptr, i32, <12 x i1>, <12 x i32>)
+declare <14 x i32> @llvm.masked.load.v14i32.p0(ptr, i32, <14 x i1>, <14 x i32>)
+declare <16 x i32> @llvm.masked.load.v16i32.p0(ptr, i32, <16 x i1>, <16 x i32>)
+declare <20 x i32> @llvm.masked.load.v20i32.p0(ptr, i32, <20 x i1>, <20 x i32>)
+declare <24 x i32> @llvm.masked.load.v24i32.p0(ptr, i32, <24 x i1>, <24 x i32>)
+declare <28 x i32> @llvm.masked.load.v28i32.p0(ptr, i32, <28 x i1>, <28 x i32>)
+declare <32 x i32> @llvm.masked.load.v32i32.p0(ptr, i32, <32 x i1>, <32 x i32>)
+declare <40 x i32> @llvm.masked.load.v40i32.p0(ptr, i32, <40 x i1>, <40 x i32>)
+declare <48 x i32> @llvm.masked.load.v48i32.p0(ptr, i32, <48 x i1>, <48 x i32>)
+declare <56 x i32> @llvm.masked.load.v56i32.p0(ptr, i32, <56 x i1>, <56 x i32>)
+declare <64 x i32> @llvm.masked.load.v64i32.p0(ptr, i32, <64 x i1>, <64 x i32>)
+declare <80 x i32> @llvm.masked.load.v80i32.p0(ptr, i32, <80 x i1>, <80 x i32>)
+declare <96 x i32> @llvm.masked.load.v96i32.p0(ptr, i32, <96 x i1>, <96 x i32>)
+declare <112 x i32> @llvm.masked.load.v112i32.p0(ptr, i32, <112 x i1>, <112 x i32>)
+declare <128 x i32> @llvm.masked.load.v128i32.p0(ptr, i32, <128 x i1>, <128 x i32>)
+declare <160 x i32> @llvm.masked.load.v160i32.p0(ptr, i32, <160 x i1>, <160 x i32>)
+declare <192 x i32> @llvm.masked.load.v192i32.p0(ptr, i32, <192 x i1>, <192 x i32>)
+declare <224 x i32> @llvm.masked.load.v224i32.p0(ptr, i32, <224 x i1>, <224 x i32>)
+declare <256 x i32> @llvm.masked.load.v256i32.p0(ptr, i32, <256 x i1>, <256 x i32>)
+declare <320 x i32> @llvm.masked.load.v320i32.p0(ptr, i32, <320 x i1>, <320 x i32>)
+declare <384 x i32> @llvm.masked.load.v384i32.p0(ptr, i32, <384 x i1>, <384 x i32>)
+declare <448 x i32> @llvm.masked.load.v448i32.p0(ptr, i32, <448 x i1>, <448 x i32>)
+declare <512 x i32> @llvm.masked.load.v512i32.p0(ptr, i32, <512 x i1>, <512 x i32>)
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; AVX512: {{.*}}
 ; FALLBACK0: {{.*}}


        


More information about the llvm-commits mailing list