[llvm] 3187976 - [NFC][X86][Codegen] Add codegen test coverage for i1 mask replication (AVX512 only)

Roman Lebedev via llvm-commits llvm-commits at lists.llvm.org
Tue Dec 13 10:21:46 PST 2022


Author: Roman Lebedev
Date: 2022-12-13T21:21:07+03:00
New Revision: 31879765d838c1797e9efd54deb225096ab89f03

URL: https://github.com/llvm/llvm-project/commit/31879765d838c1797e9efd54deb225096ab89f03
DIFF: https://github.com/llvm/llvm-project/commit/31879765d838c1797e9efd54deb225096ab89f03.diff

LOG: [NFC][X86][Codegen] Add codegen test coverage for i1 mask replication (AVX512 only)

Apparently I didn't add it when adding cost model coverage?

Added: 
    llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll

Modified: 
    

Removed: 
    


################################################################################
diff  --git a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll
new file mode 100644
index 000000000000..1fac1a049095
--- /dev/null
+++ b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll
@@ -0,0 +1,29220 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512F-ONLY,AVX512F-SLOW,FALLBACK0
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512F-ONLY,AVX512F-FAST,FALLBACK1
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ,AVX512DQ-SLOW,FALLBACK2
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512DQ,AVX512DQ-FAST,FALLBACK3
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW,AVX512BW-ONLY,AVX512BW-SLOW,FALLBACK4
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW,AVX512BW-ONLY,AVX512BW-FAST,FALLBACK5
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512vbmi | FileCheck %s --check-prefixes=AVX512,AVX512BW,AVX512VBMI-ONLY,AVX512VBMI-SLOW,FALLBACK6
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512vbmi,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW,AVX512VBMI-ONLY,AVX512VBMI-FAST,FALLBACK7
+
+; Replication factor 2, 2 source elements: loads a <2 x i1> mask from %in.vec,
+; repeats every element twice back-to-back (shuffle indices 0,0,1,1) and stores
+; the resulting <4 x i1> mask to %out.vec.
+define void @mask_replication_factor2_vf2(ptr %in.vec, ptr %out.vec) nounwind {
+; AVX512F-ONLY-LABEL: mask_replication_factor2_vf2:
+; AVX512F-ONLY:       # %bb.0:
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
+; AVX512F-ONLY-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512F-ONLY-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX512F-ONLY-NEXT:    vptestmd %xmm0, %xmm0, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, %eax
+; AVX512F-ONLY-NEXT:    movb %al, (%rsi)
+; AVX512F-ONLY-NEXT:    retq
+;
+; AVX512DQ-LABEL: mask_replication_factor2_vf2:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    kmovb (%rdi), %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %xmm0
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX512DQ-NEXT:    vpmovd2m %xmm0, %k0
+; AVX512DQ-NEXT:    kmovb %k0, (%rsi)
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: mask_replication_factor2_vf2:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    kmovw (%rdi), %k1
+; AVX512BW-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512BW-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX512BW-NEXT:    vptestmd %xmm0, %xmm0, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    movb %al, (%rsi)
+; AVX512BW-NEXT:    retq
+  %src.vec = load <2 x i1>, ptr %in.vec, align 64
+  %tgt.mask = shufflevector <2 x i1> %src.vec, <2 x i1> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+  store <4 x i1> %tgt.mask, ptr %out.vec, align 64
+  ret void
+}
+
+; Replication factor 2, 4 source elements: loads a <4 x i1> mask from %in.vec,
+; repeats every element twice back-to-back (shuffle indices 0,0,1,1,2,2,3,3)
+; and stores the resulting <8 x i1> mask to %out.vec.
+define void @mask_replication_factor2_vf4(ptr %in.vec, ptr %out.vec) nounwind {
+; AVX512F-SLOW-LABEL: mask_replication_factor2_vf4:
+; AVX512F-SLOW:       # %bb.0:
+; AVX512F-SLOW-NEXT:    kmovw (%rdi), %k1
+; AVX512F-SLOW-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512F-SLOW-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512F-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512F-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
+; AVX512F-SLOW-NEXT:    vptestmd %ymm0, %ymm0, %k0
+; AVX512F-SLOW-NEXT:    kmovw %k0, %eax
+; AVX512F-SLOW-NEXT:    movb %al, (%rsi)
+; AVX512F-SLOW-NEXT:    vzeroupper
+; AVX512F-SLOW-NEXT:    retq
+;
+; AVX512F-FAST-LABEL: mask_replication_factor2_vf4:
+; AVX512F-FAST:       # %bb.0:
+; AVX512F-FAST-NEXT:    kmovw (%rdi), %k1
+; AVX512F-FAST-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512F-FAST-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3]
+; AVX512F-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX512F-FAST-NEXT:    vptestmd %ymm0, %ymm0, %k0
+; AVX512F-FAST-NEXT:    kmovw %k0, %eax
+; AVX512F-FAST-NEXT:    movb %al, (%rsi)
+; AVX512F-FAST-NEXT:    vzeroupper
+; AVX512F-FAST-NEXT:    retq
+;
+; AVX512DQ-SLOW-LABEL: mask_replication_factor2_vf4:
+; AVX512DQ-SLOW:       # %bb.0:
+; AVX512DQ-SLOW-NEXT:    kmovb (%rdi), %k0
+; AVX512DQ-SLOW-NEXT:    vpmovm2d %k0, %ymm0
+; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512DQ-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
+; AVX512DQ-SLOW-NEXT:    vpmovd2m %ymm0, %k0
+; AVX512DQ-SLOW-NEXT:    kmovb %k0, (%rsi)
+; AVX512DQ-SLOW-NEXT:    vzeroupper
+; AVX512DQ-SLOW-NEXT:    retq
+;
+; AVX512DQ-FAST-LABEL: mask_replication_factor2_vf4:
+; AVX512DQ-FAST:       # %bb.0:
+; AVX512DQ-FAST-NEXT:    kmovb (%rdi), %k0
+; AVX512DQ-FAST-NEXT:    vpmovm2d %k0, %ymm0
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3]
+; AVX512DQ-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX512DQ-FAST-NEXT:    vpmovd2m %ymm0, %k0
+; AVX512DQ-FAST-NEXT:    kmovb %k0, (%rsi)
+; AVX512DQ-FAST-NEXT:    vzeroupper
+; AVX512DQ-FAST-NEXT:    retq
+;
+; AVX512BW-SLOW-LABEL: mask_replication_factor2_vf4:
+; AVX512BW-SLOW:       # %bb.0:
+; AVX512BW-SLOW-NEXT:    kmovw (%rdi), %k1
+; AVX512BW-SLOW-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512BW-SLOW-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512BW-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
+; AVX512BW-SLOW-NEXT:    vptestmd %ymm0, %ymm0, %k0
+; AVX512BW-SLOW-NEXT:    kmovd %k0, %eax
+; AVX512BW-SLOW-NEXT:    movb %al, (%rsi)
+; AVX512BW-SLOW-NEXT:    vzeroupper
+; AVX512BW-SLOW-NEXT:    retq
+;
+; AVX512BW-FAST-LABEL: mask_replication_factor2_vf4:
+; AVX512BW-FAST:       # %bb.0:
+; AVX512BW-FAST-NEXT:    kmovw (%rdi), %k1
+; AVX512BW-FAST-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512BW-FAST-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3]
+; AVX512BW-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX512BW-FAST-NEXT:    vptestmd %ymm0, %ymm0, %k0
+; AVX512BW-FAST-NEXT:    kmovd %k0, %eax
+; AVX512BW-FAST-NEXT:    movb %al, (%rsi)
+; AVX512BW-FAST-NEXT:    vzeroupper
+; AVX512BW-FAST-NEXT:    retq
+;
+; AVX512VBMI-SLOW-LABEL: mask_replication_factor2_vf4:
+; AVX512VBMI-SLOW:       # %bb.0:
+; AVX512VBMI-SLOW-NEXT:    kmovw (%rdi), %k1
+; AVX512VBMI-SLOW-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512VBMI-SLOW-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512VBMI-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512VBMI-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
+; AVX512VBMI-SLOW-NEXT:    vptestmd %ymm0, %ymm0, %k0
+; AVX512VBMI-SLOW-NEXT:    kmovd %k0, %eax
+; AVX512VBMI-SLOW-NEXT:    movb %al, (%rsi)
+; AVX512VBMI-SLOW-NEXT:    vzeroupper
+; AVX512VBMI-SLOW-NEXT:    retq
+;
+; AVX512VBMI-FAST-LABEL: mask_replication_factor2_vf4:
+; AVX512VBMI-FAST:       # %bb.0:
+; AVX512VBMI-FAST-NEXT:    kmovw (%rdi), %k1
+; AVX512VBMI-FAST-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512VBMI-FAST-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512VBMI-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3]
+; AVX512VBMI-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX512VBMI-FAST-NEXT:    vptestmd %ymm0, %ymm0, %k0
+; AVX512VBMI-FAST-NEXT:    kmovd %k0, %eax
+; AVX512VBMI-FAST-NEXT:    movb %al, (%rsi)
+; AVX512VBMI-FAST-NEXT:    vzeroupper
+; AVX512VBMI-FAST-NEXT:    retq
+  %src.vec = load <4 x i1>, ptr %in.vec, align 64
+  %tgt.mask = shufflevector <4 x i1> %src.vec, <4 x i1> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+  store <8 x i1> %tgt.mask, ptr %out.vec, align 64
+  ret void
+}
+
+; Replication factor 2, 8 source elements: loads an <8 x i1> mask from %in.vec,
+; repeats every element twice back-to-back (shuffle indices 0,0,1,1,...,7,7)
+; and stores the resulting <16 x i1> mask to %out.vec.
+define void @mask_replication_factor2_vf8(ptr %in.vec, ptr %out.vec) nounwind {
+; AVX512F-ONLY-LABEL: mask_replication_factor2_vf8:
+; AVX512F-ONLY:       # %bb.0:
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, (%rsi)
+; AVX512F-ONLY-NEXT:    vzeroupper
+; AVX512F-ONLY-NEXT:    retq
+;
+; AVX512DQ-LABEL: mask_replication_factor2_vf8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    kmovb (%rdi), %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, (%rsi)
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: mask_replication_factor2_vf8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    kmovw (%rdi), %k1
+; AVX512BW-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512BW-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; AVX512BW-NEXT:    kmovw %k0, (%rsi)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+  %src.vec = load <8 x i1>, ptr %in.vec, align 64
+  %tgt.mask = shufflevector <8 x i1> %src.vec, <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+  store <16 x i1> %tgt.mask, ptr %out.vec, align 64
+  ret void
+}
+
+; Replication factor 2, 16 source elements: loads a <16 x i1> mask from
+; %in.vec, repeats every element twice back-to-back (shuffle indices
+; 0,0,1,1,...,15,15) and stores the resulting <32 x i1> mask to %out.vec.
+define void @mask_replication_factor2_vf16(ptr %in.vec, ptr %out.vec) nounwind {
+; AVX512F-ONLY-LABEL: mask_replication_factor2_vf16:
+; AVX512F-ONLY:       # %bb.0:
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; AVX512F-ONLY-NEXT:    kmovw %k1, 2(%rsi)
+; AVX512F-ONLY-NEXT:    kmovw %k0, (%rsi)
+; AVX512F-ONLY-NEXT:    vzeroupper
+; AVX512F-ONLY-NEXT:    retq
+;
+; AVX512DQ-LABEL: mask_replication_factor2_vf16:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    kmovw (%rdi), %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k0
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k1
+; AVX512DQ-NEXT:    kmovw %k1, 2(%rsi)
+; AVX512DQ-NEXT:    kmovw %k0, (%rsi)
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: mask_replication_factor2_vf16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    kmovw (%rdi), %k0
+; AVX512BW-NEXT:    vpmovm2w %k0, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512BW-NEXT:    vpermw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vpmovw2m %zmm0, %k0
+; AVX512BW-NEXT:    kmovd %k0, (%rsi)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+  %src.vec = load <16 x i1>, ptr %in.vec, align 64
+  %tgt.mask = shufflevector <16 x i1> %src.vec, <16 x i1> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
+  store <32 x i1> %tgt.mask, ptr %out.vec, align 64
+  ret void
+}
+
+; Replication factor 2, 32 source elements: loads a <32 x i1> mask from
+; %in.vec, repeats every element twice back-to-back (shuffle indices
+; 0,0,1,1,...,31,31) and stores the resulting <64 x i1> mask to %out.vec.
+define void @mask_replication_factor2_vf32(ptr %in.vec, ptr %out.vec) nounwind {
+; AVX512F-ONLY-LABEL: mask_replication_factor2_vf32:
+; AVX512F-ONLY:       # %bb.0:
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
+; AVX512F-ONLY-NEXT:    kmovw 2(%rdi), %k2
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm2
+; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k0
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm2, %zmm0
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k2
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm2, %zmm0
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k3
+; AVX512F-ONLY-NEXT:    kmovw %k3, 4(%rsi)
+; AVX512F-ONLY-NEXT:    kmovw %k2, 6(%rsi)
+; AVX512F-ONLY-NEXT:    kmovw %k1, (%rsi)
+; AVX512F-ONLY-NEXT:    kmovw %k0, 2(%rsi)
+; AVX512F-ONLY-NEXT:    vzeroupper
+; AVX512F-ONLY-NEXT:    retq
+;
+; AVX512DQ-LABEL: mask_replication_factor2_vf32:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    kmovw (%rdi), %k0
+; AVX512DQ-NEXT:    kmovw 2(%rdi), %k1
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm2
+; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm2, %zmm0
+; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k2
+; AVX512DQ-NEXT:    vpmovm2d %k1, %zmm0
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k1
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm2, %zmm0
+; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k3
+; AVX512DQ-NEXT:    kmovw %k3, 4(%rsi)
+; AVX512DQ-NEXT:    kmovw %k1, 6(%rsi)
+; AVX512DQ-NEXT:    kmovw %k2, (%rsi)
+; AVX512DQ-NEXT:    kmovw %k0, 2(%rsi)
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-ONLY-LABEL: mask_replication_factor2_vf32:
+; AVX512BW-ONLY:       # %bb.0:
+; AVX512BW-ONLY-NEXT:    kmovd (%rdi), %k0
+; AVX512BW-ONLY-NEXT:    vpmovm2b %k0, %zmm0
+; AVX512BW-ONLY-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,2,3,2,3]
+; AVX512BW-ONLY-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
+; AVX512BW-ONLY-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512BW-ONLY-NEXT:    kmovq %k0, (%rsi)
+; AVX512BW-ONLY-NEXT:    vzeroupper
+; AVX512BW-ONLY-NEXT:    retq
+;
+; AVX512VBMI-ONLY-LABEL: mask_replication_factor2_vf32:
+; AVX512VBMI-ONLY:       # %bb.0:
+; AVX512VBMI-ONLY-NEXT:    kmovd (%rdi), %k0
+; AVX512VBMI-ONLY-NEXT:    vpmovm2b %k0, %zmm0
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512VBMI-ONLY-NEXT:    vpermb %zmm0, %zmm1, %zmm0
+; AVX512VBMI-ONLY-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512VBMI-ONLY-NEXT:    kmovq %k0, (%rsi)
+; AVX512VBMI-ONLY-NEXT:    vzeroupper
+; AVX512VBMI-ONLY-NEXT:    retq
+  %src.vec = load <32 x i1>, ptr %in.vec, align 64
+  %tgt.mask = shufflevector <32 x i1> %src.vec, <32 x i1> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
+  store <64 x i1> %tgt.mask, ptr %out.vec, align 64
+  ret void
+}
+
+; Replication factor 2, 64 source elements: loads a <64 x i1> mask from
+; %in.vec, repeats every element twice back-to-back (shuffle indices
+; 0,0,1,1,...,63,63) and stores the resulting <128 x i1> mask to %out.vec.
+define void @mask_replication_factor2_vf64(ptr %in.vec, ptr %out.vec) nounwind {
+; AVX512F-ONLY-LABEL: mask_replication_factor2_vf64:
+; AVX512F-ONLY:       # %bb.0:
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
+; AVX512F-ONLY-NEXT:    kmovw 2(%rdi), %k3
+; AVX512F-ONLY-NEXT:    kmovw 4(%rdi), %k4
+; AVX512F-ONLY-NEXT:    kmovw 6(%rdi), %k2
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm2
+; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k0
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm2, %zmm0
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k3} {z}
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm3
+; AVX512F-ONLY-NEXT:    vptestmd %zmm3, %zmm3, %k3
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm2, %zmm0
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k5
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z}
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm3
+; AVX512F-ONLY-NEXT:    vptestmd %zmm3, %zmm3, %k4
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm2, %zmm0
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k6
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k2
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm2, %zmm0
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k7
+; AVX512F-ONLY-NEXT:    kmovw %k7, 12(%rsi)
+; AVX512F-ONLY-NEXT:    kmovw %k2, 14(%rsi)
+; AVX512F-ONLY-NEXT:    kmovw %k6, 8(%rsi)
+; AVX512F-ONLY-NEXT:    kmovw %k4, 10(%rsi)
+; AVX512F-ONLY-NEXT:    kmovw %k5, 4(%rsi)
+; AVX512F-ONLY-NEXT:    kmovw %k3, 6(%rsi)
+; AVX512F-ONLY-NEXT:    kmovw %k1, (%rsi)
+; AVX512F-ONLY-NEXT:    kmovw %k0, 2(%rsi)
+; AVX512F-ONLY-NEXT:    vzeroupper
+; AVX512F-ONLY-NEXT:    retq
+;
+; AVX512DQ-LABEL: mask_replication_factor2_vf64:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    kmovw (%rdi), %k0
+; AVX512DQ-NEXT:    kmovw 2(%rdi), %k3
+; AVX512DQ-NEXT:    kmovw 4(%rdi), %k4
+; AVX512DQ-NEXT:    kmovw 6(%rdi), %k2
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm2
+; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm2, %zmm0
+; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k1
+; AVX512DQ-NEXT:    vpmovm2d %k3, %zmm0
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm3
+; AVX512DQ-NEXT:    vpmovd2m %zmm3, %k3
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm2, %zmm0
+; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k5
+; AVX512DQ-NEXT:    vpmovm2d %k4, %zmm0
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm3
+; AVX512DQ-NEXT:    vpmovd2m %zmm3, %k4
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm2, %zmm0
+; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k6
+; AVX512DQ-NEXT:    vpmovm2d %k2, %zmm0
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k2
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm2, %zmm0
+; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k7
+; AVX512DQ-NEXT:    kmovw %k7, 12(%rsi)
+; AVX512DQ-NEXT:    kmovw %k2, 14(%rsi)
+; AVX512DQ-NEXT:    kmovw %k6, 8(%rsi)
+; AVX512DQ-NEXT:    kmovw %k4, 10(%rsi)
+; AVX512DQ-NEXT:    kmovw %k5, 4(%rsi)
+; AVX512DQ-NEXT:    kmovw %k3, 6(%rsi)
+; AVX512DQ-NEXT:    kmovw %k1, (%rsi)
+; AVX512DQ-NEXT:    kmovw %k0, 2(%rsi)
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-ONLY-LABEL: mask_replication_factor2_vf64:
+; AVX512BW-ONLY:       # %bb.0:
+; AVX512BW-ONLY-NEXT:    kmovq (%rdi), %k0
+; AVX512BW-ONLY-NEXT:    vpmovm2b %k0, %zmm0
+; AVX512BW-ONLY-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,0,1,2,3,2,3]
+; AVX512BW-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512BW-ONLY-NEXT:    vpshufb %zmm2, %zmm1, %zmm1
+; AVX512BW-ONLY-NEXT:    vpmovb2m %zmm1, %k0
+; AVX512BW-ONLY-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,4,5,6,7,6,7]
+; AVX512BW-ONLY-NEXT:    vpshufb %zmm2, %zmm0, %zmm0
+; AVX512BW-ONLY-NEXT:    vpmovb2m %zmm0, %k1
+; AVX512BW-ONLY-NEXT:    kmovq %k1, 8(%rsi)
+; AVX512BW-ONLY-NEXT:    kmovq %k0, (%rsi)
+; AVX512BW-ONLY-NEXT:    vzeroupper
+; AVX512BW-ONLY-NEXT:    retq
+;
+; AVX512VBMI-ONLY-LABEL: mask_replication_factor2_vf64:
+; AVX512VBMI-ONLY:       # %bb.0:
+; AVX512VBMI-ONLY-NEXT:    kmovq (%rdi), %k0
+; AVX512VBMI-ONLY-NEXT:    vpmovm2b %k0, %zmm0
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512VBMI-ONLY-NEXT:    vpermb %zmm0, %zmm1, %zmm1
+; AVX512VBMI-ONLY-NEXT:    vpmovb2m %zmm1, %k0
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
+; AVX512VBMI-ONLY-NEXT:    vpermb %zmm0, %zmm1, %zmm0
+; AVX512VBMI-ONLY-NEXT:    vpmovb2m %zmm0, %k1
+; AVX512VBMI-ONLY-NEXT:    kmovq %k1, 8(%rsi)
+; AVX512VBMI-ONLY-NEXT:    kmovq %k0, (%rsi)
+; AVX512VBMI-ONLY-NEXT:    vzeroupper
+; AVX512VBMI-ONLY-NEXT:    retq
+  %src.vec = load <64 x i1>, ptr %in.vec, align 64
+  %tgt.mask = shufflevector <64 x i1> %src.vec, <64 x i1> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
+  store <128 x i1> %tgt.mask, ptr %out.vec, align 64
+  ret void
+}
+
+; Replication factor 3, 2 source elements: loads a <2 x i1> mask from %in.vec,
+; repeats every element three times back-to-back (shuffle indices 0,0,0,1,1,1)
+; and stores the resulting <6 x i1> mask to %out.vec.
+define void @mask_replication_factor3_vf2(ptr %in.vec, ptr %out.vec) nounwind {
+; AVX512F-ONLY-LABEL: mask_replication_factor3_vf2:
+; AVX512F-ONLY:       # %bb.0:
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k0
+; AVX512F-ONLY-NEXT:    kshiftrw $1, %k0, %k1
+; AVX512F-ONLY-NEXT:    kmovw %k1, %eax
+; AVX512F-ONLY-NEXT:    kmovw %k0, %ecx
+; AVX512F-ONLY-NEXT:    andb $1, %cl
+; AVX512F-ONLY-NEXT:    leal (%rcx,%rcx), %edx
+; AVX512F-ONLY-NEXT:    orb %cl, %dl
+; AVX512F-ONLY-NEXT:    shlb $2, %cl
+; AVX512F-ONLY-NEXT:    orb %dl, %cl
+; AVX512F-ONLY-NEXT:    movl %eax, %edx
+; AVX512F-ONLY-NEXT:    andb $1, %dl
+; AVX512F-ONLY-NEXT:    leal (,%rdx,8), %edi
+; AVX512F-ONLY-NEXT:    orb %cl, %dil
+; AVX512F-ONLY-NEXT:    shlb $4, %dl
+; AVX512F-ONLY-NEXT:    orb %dil, %dl
+; AVX512F-ONLY-NEXT:    shlb $5, %al
+; AVX512F-ONLY-NEXT:    orb %dl, %al
+; AVX512F-ONLY-NEXT:    andb $63, %al
+; AVX512F-ONLY-NEXT:    movb %al, (%rsi)
+; AVX512F-ONLY-NEXT:    retq
+;
+; AVX512DQ-LABEL: mask_replication_factor3_vf2:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    kmovb (%rdi), %k0
+; AVX512DQ-NEXT:    kshiftrb $1, %k0, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %eax
+; AVX512DQ-NEXT:    kmovw %k0, %ecx
+; AVX512DQ-NEXT:    andb $1, %cl
+; AVX512DQ-NEXT:    leal (%rcx,%rcx), %edx
+; AVX512DQ-NEXT:    orb %cl, %dl
+; AVX512DQ-NEXT:    shlb $2, %cl
+; AVX512DQ-NEXT:    orb %dl, %cl
+; AVX512DQ-NEXT:    movl %eax, %edx
+; AVX512DQ-NEXT:    andb $1, %dl
+; AVX512DQ-NEXT:    leal (,%rdx,8), %edi
+; AVX512DQ-NEXT:    orb %cl, %dil
+; AVX512DQ-NEXT:    shlb $4, %dl
+; AVX512DQ-NEXT:    orb %dil, %dl
+; AVX512DQ-NEXT:    shlb $5, %al
+; AVX512DQ-NEXT:    orb %dl, %al
+; AVX512DQ-NEXT:    andb $63, %al
+; AVX512DQ-NEXT:    movb %al, (%rsi)
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: mask_replication_factor3_vf2:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    kmovw (%rdi), %k0
+; AVX512BW-NEXT:    kshiftrw $1, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    kmovd %k0, %ecx
+; AVX512BW-NEXT:    andb $1, %cl
+; AVX512BW-NEXT:    leal (%rcx,%rcx), %edx
+; AVX512BW-NEXT:    orb %cl, %dl
+; AVX512BW-NEXT:    shlb $2, %cl
+; AVX512BW-NEXT:    orb %dl, %cl
+; AVX512BW-NEXT:    movl %eax, %edx
+; AVX512BW-NEXT:    andb $1, %dl
+; AVX512BW-NEXT:    leal (,%rdx,8), %edi
+; AVX512BW-NEXT:    orb %cl, %dil
+; AVX512BW-NEXT:    shlb $4, %dl
+; AVX512BW-NEXT:    orb %dil, %dl
+; AVX512BW-NEXT:    shlb $5, %al
+; AVX512BW-NEXT:    orb %dl, %al
+; AVX512BW-NEXT:    andb $63, %al
+; AVX512BW-NEXT:    movb %al, (%rsi)
+; AVX512BW-NEXT:    retq
+  %src.vec = load <2 x i1>, ptr %in.vec, align 64
+  %tgt.mask = shufflevector <2 x i1> %src.vec, <2 x i1> poison, <6 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1>
+  store <6 x i1> %tgt.mask, ptr %out.vec, align 64
+  ret void
+}
+
+; Replication factor 3, 4 source elements: loads a <4 x i1> mask from %in.vec,
+; repeats every element three times back-to-back (shuffle indices
+; 0,0,0,1,1,1,2,2,2,3,3,3) and stores the resulting <12 x i1> mask to %out.vec.
+define void @mask_replication_factor3_vf4(ptr %in.vec, ptr %out.vec) nounwind {
+; AVX512F-ONLY-LABEL: mask_replication_factor3_vf4:
+; AVX512F-ONLY:       # %bb.0:
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k2
+; AVX512F-ONLY-NEXT:    kshiftrw $3, %k2, %k0
+; AVX512F-ONLY-NEXT:    kshiftrw $2, %k2, %k1
+; AVX512F-ONLY-NEXT:    kshiftrw $1, %k2, %k3
+; AVX512F-ONLY-NEXT:    kmovw %k2, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    leal (%rax,%rax,2), %ecx
+; AVX512F-ONLY-NEXT:    leal (%rcx,%rax,4), %ecx
+; AVX512F-ONLY-NEXT:    kmovw %k3, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    leal (%rcx,%rax,8), %ecx
+; AVX512F-ONLY-NEXT:    movl %eax, %edx
+; AVX512F-ONLY-NEXT:    shll $4, %edx
+; AVX512F-ONLY-NEXT:    orl %ecx, %edx
+; AVX512F-ONLY-NEXT:    shll $5, %eax
+; AVX512F-ONLY-NEXT:    orl %edx, %eax
+; AVX512F-ONLY-NEXT:    kmovw %k1, %ecx
+; AVX512F-ONLY-NEXT:    andl $1, %ecx
+; AVX512F-ONLY-NEXT:    movl %ecx, %edx
+; AVX512F-ONLY-NEXT:    shll $6, %edx
+; AVX512F-ONLY-NEXT:    movl %ecx, %edi
+; AVX512F-ONLY-NEXT:    shll $7, %edi
+; AVX512F-ONLY-NEXT:    orl %edx, %edi
+; AVX512F-ONLY-NEXT:    shll $8, %ecx
+; AVX512F-ONLY-NEXT:    orl %edi, %ecx
+; AVX512F-ONLY-NEXT:    kmovw %k0, %edx
+; AVX512F-ONLY-NEXT:    movl %edx, %edi
+; AVX512F-ONLY-NEXT:    andl $1, %edi
+; AVX512F-ONLY-NEXT:    movl %edi, %r8d
+; AVX512F-ONLY-NEXT:    shll $9, %r8d
+; AVX512F-ONLY-NEXT:    orl %ecx, %r8d
+; AVX512F-ONLY-NEXT:    shll $10, %edi
+; AVX512F-ONLY-NEXT:    orl %r8d, %edi
+; AVX512F-ONLY-NEXT:    shll $11, %edx
+; AVX512F-ONLY-NEXT:    orl %edi, %edx
+; AVX512F-ONLY-NEXT:    orl %eax, %edx
+; AVX512F-ONLY-NEXT:    andl $4095, %edx # imm = 0xFFF
+; AVX512F-ONLY-NEXT:    movw %dx, (%rsi)
+; AVX512F-ONLY-NEXT:    retq
+;
+; AVX512DQ-LABEL: mask_replication_factor3_vf4:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    kmovb (%rdi), %k2
+; AVX512DQ-NEXT:    kshiftrb $3, %k2, %k0
+; AVX512DQ-NEXT:    kshiftrb $2, %k2, %k1
+; AVX512DQ-NEXT:    kshiftrb $1, %k2, %k3
+; AVX512DQ-NEXT:    kmovw %k2, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    leal (%rax,%rax,2), %ecx
+; AVX512DQ-NEXT:    leal (%rcx,%rax,4), %ecx
+; AVX512DQ-NEXT:    kmovw %k3, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    leal (%rcx,%rax,8), %ecx
+; AVX512DQ-NEXT:    movl %eax, %edx
+; AVX512DQ-NEXT:    shll $4, %edx
+; AVX512DQ-NEXT:    orl %ecx, %edx
+; AVX512DQ-NEXT:    shll $5, %eax
+; AVX512DQ-NEXT:    orl %edx, %eax
+; AVX512DQ-NEXT:    kmovw %k1, %ecx
+; AVX512DQ-NEXT:    andl $1, %ecx
+; AVX512DQ-NEXT:    movl %ecx, %edx
+; AVX512DQ-NEXT:    shll $6, %edx
+; AVX512DQ-NEXT:    movl %ecx, %edi
+; AVX512DQ-NEXT:    shll $7, %edi
+; AVX512DQ-NEXT:    orl %edx, %edi
+; AVX512DQ-NEXT:    shll $8, %ecx
+; AVX512DQ-NEXT:    orl %edi, %ecx
+; AVX512DQ-NEXT:    kmovw %k0, %edx
+; AVX512DQ-NEXT:    movl %edx, %edi
+; AVX512DQ-NEXT:    andl $1, %edi
+; AVX512DQ-NEXT:    movl %edi, %r8d
+; AVX512DQ-NEXT:    shll $9, %r8d
+; AVX512DQ-NEXT:    orl %ecx, %r8d
+; AVX512DQ-NEXT:    shll $10, %edi
+; AVX512DQ-NEXT:    orl %r8d, %edi
+; AVX512DQ-NEXT:    shll $11, %edx
+; AVX512DQ-NEXT:    orl %edi, %edx
+; AVX512DQ-NEXT:    orl %eax, %edx
+; AVX512DQ-NEXT:    andl $4095, %edx # imm = 0xFFF
+; AVX512DQ-NEXT:    movw %dx, (%rsi)
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: mask_replication_factor3_vf4:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    kmovw (%rdi), %k2
+; AVX512BW-NEXT:    kshiftrw $3, %k2, %k0
+; AVX512BW-NEXT:    kshiftrw $2, %k2, %k1
+; AVX512BW-NEXT:    kshiftrw $1, %k2, %k3
+; AVX512BW-NEXT:    kmovd %k2, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    leal (%rax,%rax,2), %ecx
+; AVX512BW-NEXT:    leal (%rcx,%rax,4), %ecx
+; AVX512BW-NEXT:    kmovd %k3, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    leal (%rcx,%rax,8), %ecx
+; AVX512BW-NEXT:    movl %eax, %edx
+; AVX512BW-NEXT:    shll $4, %edx
+; AVX512BW-NEXT:    orl %ecx, %edx
+; AVX512BW-NEXT:    shll $5, %eax
+; AVX512BW-NEXT:    orl %edx, %eax
+; AVX512BW-NEXT:    kmovd %k1, %ecx
+; AVX512BW-NEXT:    andl $1, %ecx
+; AVX512BW-NEXT:    movl %ecx, %edx
+; AVX512BW-NEXT:    shll $6, %edx
+; AVX512BW-NEXT:    movl %ecx, %edi
+; AVX512BW-NEXT:    shll $7, %edi
+; AVX512BW-NEXT:    orl %edx, %edi
+; AVX512BW-NEXT:    shll $8, %ecx
+; AVX512BW-NEXT:    orl %edi, %ecx
+; AVX512BW-NEXT:    kmovd %k0, %edx
+; AVX512BW-NEXT:    movl %edx, %edi
+; AVX512BW-NEXT:    andl $1, %edi
+; AVX512BW-NEXT:    movl %edi, %r8d
+; AVX512BW-NEXT:    shll $9, %r8d
+; AVX512BW-NEXT:    orl %ecx, %r8d
+; AVX512BW-NEXT:    shll $10, %edi
+; AVX512BW-NEXT:    orl %r8d, %edi
+; AVX512BW-NEXT:    shll $11, %edx
+; AVX512BW-NEXT:    orl %edi, %edx
+; AVX512BW-NEXT:    orl %eax, %edx
+; AVX512BW-NEXT:    andl $4095, %edx # imm = 0xFFF
+; AVX512BW-NEXT:    movw %dx, (%rsi)
+; AVX512BW-NEXT:    retq
+  %src.vec = load <4 x i1>, ptr %in.vec, align 64
+  %tgt.mask = shufflevector <4 x i1> %src.vec, <4 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
+  store <12 x i1> %tgt.mask, ptr %out.vec, align 64
+  ret void
+}
+
+define void @mask_replication_factor3_vf8(ptr %in.vec, ptr %out.vec) nounwind { ; Replicates every lane of an <8 x i1> mask 3 times, producing a <24 x i1> mask. The check lines below are autogenerated (see the NOTE at the top of the file); regenerate them instead of editing by hand.
+; AVX512F-ONLY-LABEL: mask_replication_factor3_vf8:
+; AVX512F-ONLY:       # %bb.0:
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k6
+; AVX512F-ONLY-NEXT:    kshiftrw $7, %k6, %k0
+; AVX512F-ONLY-NEXT:    kshiftrw $6, %k6, %k1
+; AVX512F-ONLY-NEXT:    kshiftrw $5, %k6, %k2
+; AVX512F-ONLY-NEXT:    kshiftrw $4, %k6, %k3
+; AVX512F-ONLY-NEXT:    kshiftrw $3, %k6, %k4
+; AVX512F-ONLY-NEXT:    kshiftrw $2, %k6, %k5
+; AVX512F-ONLY-NEXT:    kshiftrw $1, %k6, %k7
+; AVX512F-ONLY-NEXT:    kmovw %k6, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    leal (%rax,%rax,2), %ecx
+; AVX512F-ONLY-NEXT:    leal (%rcx,%rax,4), %ecx
+; AVX512F-ONLY-NEXT:    kmovw %k7, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    leal (%rcx,%rax,8), %ecx
+; AVX512F-ONLY-NEXT:    movl %eax, %edx
+; AVX512F-ONLY-NEXT:    shll $4, %edx
+; AVX512F-ONLY-NEXT:    orl %ecx, %edx
+; AVX512F-ONLY-NEXT:    shll $5, %eax
+; AVX512F-ONLY-NEXT:    orl %edx, %eax
+; AVX512F-ONLY-NEXT:    kmovw %k5, %ecx
+; AVX512F-ONLY-NEXT:    andl $1, %ecx
+; AVX512F-ONLY-NEXT:    movl %ecx, %edx
+; AVX512F-ONLY-NEXT:    shll $6, %edx
+; AVX512F-ONLY-NEXT:    movl %ecx, %edi
+; AVX512F-ONLY-NEXT:    shll $7, %edi
+; AVX512F-ONLY-NEXT:    orl %edx, %edi
+; AVX512F-ONLY-NEXT:    shll $8, %ecx
+; AVX512F-ONLY-NEXT:    orl %edi, %ecx
+; AVX512F-ONLY-NEXT:    kmovw %k4, %edx
+; AVX512F-ONLY-NEXT:    andl $1, %edx
+; AVX512F-ONLY-NEXT:    movl %edx, %edi
+; AVX512F-ONLY-NEXT:    shll $9, %edi
+; AVX512F-ONLY-NEXT:    orl %ecx, %edi
+; AVX512F-ONLY-NEXT:    movl %edx, %ecx
+; AVX512F-ONLY-NEXT:    shll $10, %ecx
+; AVX512F-ONLY-NEXT:    orl %edi, %ecx
+; AVX512F-ONLY-NEXT:    shll $11, %edx
+; AVX512F-ONLY-NEXT:    orl %ecx, %edx
+; AVX512F-ONLY-NEXT:    kmovw %k3, %edi
+; AVX512F-ONLY-NEXT:    andl $1, %edi
+; AVX512F-ONLY-NEXT:    movl %edi, %ecx
+; AVX512F-ONLY-NEXT:    shll $12, %ecx
+; AVX512F-ONLY-NEXT:    orl %edx, %ecx
+; AVX512F-ONLY-NEXT:    movl %edi, %edx
+; AVX512F-ONLY-NEXT:    shll $13, %edx
+; AVX512F-ONLY-NEXT:    orl %ecx, %edx
+; AVX512F-ONLY-NEXT:    shll $14, %edi
+; AVX512F-ONLY-NEXT:    orl %edx, %edi
+; AVX512F-ONLY-NEXT:    kmovw %k2, %edx
+; AVX512F-ONLY-NEXT:    andl $1, %edx
+; AVX512F-ONLY-NEXT:    movl %edx, %ecx
+; AVX512F-ONLY-NEXT:    shll $15, %ecx
+; AVX512F-ONLY-NEXT:    orl %edi, %ecx
+; AVX512F-ONLY-NEXT:    orl %eax, %ecx
+; AVX512F-ONLY-NEXT:    movw %cx, (%rsi)
+; AVX512F-ONLY-NEXT:    movl %edx, %eax
+; AVX512F-ONLY-NEXT:    shll $16, %eax
+; AVX512F-ONLY-NEXT:    shll $17, %edx
+; AVX512F-ONLY-NEXT:    orl %eax, %edx
+; AVX512F-ONLY-NEXT:    kmovw %k1, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movl %eax, %edi
+; AVX512F-ONLY-NEXT:    shll $18, %edi
+; AVX512F-ONLY-NEXT:    orl %edx, %edi
+; AVX512F-ONLY-NEXT:    movl %eax, %edx
+; AVX512F-ONLY-NEXT:    shll $19, %edx
+; AVX512F-ONLY-NEXT:    orl %edi, %edx
+; AVX512F-ONLY-NEXT:    shll $20, %eax
+; AVX512F-ONLY-NEXT:    orl %edx, %eax
+; AVX512F-ONLY-NEXT:    kmovw %k0, %edx
+; AVX512F-ONLY-NEXT:    movl %edx, %edi
+; AVX512F-ONLY-NEXT:    andl $1, %edi
+; AVX512F-ONLY-NEXT:    movl %edi, %r8d
+; AVX512F-ONLY-NEXT:    shll $21, %r8d
+; AVX512F-ONLY-NEXT:    orl %eax, %r8d
+; AVX512F-ONLY-NEXT:    shll $22, %edi
+; AVX512F-ONLY-NEXT:    orl %r8d, %edi
+; AVX512F-ONLY-NEXT:    shll $23, %edx
+; AVX512F-ONLY-NEXT:    orl %edi, %edx
+; AVX512F-ONLY-NEXT:    orl %ecx, %edx
+; AVX512F-ONLY-NEXT:    shrl $16, %edx
+; AVX512F-ONLY-NEXT:    movb %dl, 2(%rsi)
+; AVX512F-ONLY-NEXT:    retq
+;
+; AVX512DQ-LABEL: mask_replication_factor3_vf8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    kmovb (%rdi), %k6
+; AVX512DQ-NEXT:    kshiftrb $7, %k6, %k0
+; AVX512DQ-NEXT:    kshiftrb $6, %k6, %k1
+; AVX512DQ-NEXT:    kshiftrb $5, %k6, %k2
+; AVX512DQ-NEXT:    kshiftrb $4, %k6, %k3
+; AVX512DQ-NEXT:    kshiftrb $3, %k6, %k4
+; AVX512DQ-NEXT:    kshiftrb $2, %k6, %k5
+; AVX512DQ-NEXT:    kshiftrb $1, %k6, %k7
+; AVX512DQ-NEXT:    kmovw %k6, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    leal (%rax,%rax,2), %ecx
+; AVX512DQ-NEXT:    leal (%rcx,%rax,4), %ecx
+; AVX512DQ-NEXT:    kmovw %k7, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    leal (%rcx,%rax,8), %ecx
+; AVX512DQ-NEXT:    movl %eax, %edx
+; AVX512DQ-NEXT:    shll $4, %edx
+; AVX512DQ-NEXT:    orl %ecx, %edx
+; AVX512DQ-NEXT:    shll $5, %eax
+; AVX512DQ-NEXT:    orl %edx, %eax
+; AVX512DQ-NEXT:    kmovw %k5, %ecx
+; AVX512DQ-NEXT:    andl $1, %ecx
+; AVX512DQ-NEXT:    movl %ecx, %edx
+; AVX512DQ-NEXT:    shll $6, %edx
+; AVX512DQ-NEXT:    movl %ecx, %edi
+; AVX512DQ-NEXT:    shll $7, %edi
+; AVX512DQ-NEXT:    orl %edx, %edi
+; AVX512DQ-NEXT:    shll $8, %ecx
+; AVX512DQ-NEXT:    orl %edi, %ecx
+; AVX512DQ-NEXT:    kmovw %k4, %edx
+; AVX512DQ-NEXT:    andl $1, %edx
+; AVX512DQ-NEXT:    movl %edx, %edi
+; AVX512DQ-NEXT:    shll $9, %edi
+; AVX512DQ-NEXT:    orl %ecx, %edi
+; AVX512DQ-NEXT:    movl %edx, %ecx
+; AVX512DQ-NEXT:    shll $10, %ecx
+; AVX512DQ-NEXT:    orl %edi, %ecx
+; AVX512DQ-NEXT:    shll $11, %edx
+; AVX512DQ-NEXT:    orl %ecx, %edx
+; AVX512DQ-NEXT:    kmovw %k3, %edi
+; AVX512DQ-NEXT:    andl $1, %edi
+; AVX512DQ-NEXT:    movl %edi, %ecx
+; AVX512DQ-NEXT:    shll $12, %ecx
+; AVX512DQ-NEXT:    orl %edx, %ecx
+; AVX512DQ-NEXT:    movl %edi, %edx
+; AVX512DQ-NEXT:    shll $13, %edx
+; AVX512DQ-NEXT:    orl %ecx, %edx
+; AVX512DQ-NEXT:    shll $14, %edi
+; AVX512DQ-NEXT:    orl %edx, %edi
+; AVX512DQ-NEXT:    kmovw %k2, %edx
+; AVX512DQ-NEXT:    andl $1, %edx
+; AVX512DQ-NEXT:    movl %edx, %ecx
+; AVX512DQ-NEXT:    shll $15, %ecx
+; AVX512DQ-NEXT:    orl %edi, %ecx
+; AVX512DQ-NEXT:    orl %eax, %ecx
+; AVX512DQ-NEXT:    movw %cx, (%rsi)
+; AVX512DQ-NEXT:    movl %edx, %eax
+; AVX512DQ-NEXT:    shll $16, %eax
+; AVX512DQ-NEXT:    shll $17, %edx
+; AVX512DQ-NEXT:    orl %eax, %edx
+; AVX512DQ-NEXT:    kmovw %k1, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movl %eax, %edi
+; AVX512DQ-NEXT:    shll $18, %edi
+; AVX512DQ-NEXT:    orl %edx, %edi
+; AVX512DQ-NEXT:    movl %eax, %edx
+; AVX512DQ-NEXT:    shll $19, %edx
+; AVX512DQ-NEXT:    orl %edi, %edx
+; AVX512DQ-NEXT:    shll $20, %eax
+; AVX512DQ-NEXT:    orl %edx, %eax
+; AVX512DQ-NEXT:    kmovw %k0, %edx
+; AVX512DQ-NEXT:    movl %edx, %edi
+; AVX512DQ-NEXT:    andl $1, %edi
+; AVX512DQ-NEXT:    movl %edi, %r8d
+; AVX512DQ-NEXT:    shll $21, %r8d
+; AVX512DQ-NEXT:    orl %eax, %r8d
+; AVX512DQ-NEXT:    shll $22, %edi
+; AVX512DQ-NEXT:    orl %r8d, %edi
+; AVX512DQ-NEXT:    shll $23, %edx
+; AVX512DQ-NEXT:    orl %edi, %edx
+; AVX512DQ-NEXT:    orl %ecx, %edx
+; AVX512DQ-NEXT:    shrl $16, %edx
+; AVX512DQ-NEXT:    movb %dl, 2(%rsi)
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: mask_replication_factor3_vf8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    kmovw (%rdi), %k6
+; AVX512BW-NEXT:    kshiftrw $7, %k6, %k0
+; AVX512BW-NEXT:    kshiftrw $6, %k6, %k1
+; AVX512BW-NEXT:    kshiftrw $5, %k6, %k2
+; AVX512BW-NEXT:    kshiftrw $4, %k6, %k3
+; AVX512BW-NEXT:    kshiftrw $3, %k6, %k4
+; AVX512BW-NEXT:    kshiftrw $2, %k6, %k5
+; AVX512BW-NEXT:    kshiftrw $1, %k6, %k7
+; AVX512BW-NEXT:    kmovd %k6, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    leal (%rax,%rax,2), %ecx
+; AVX512BW-NEXT:    leal (%rcx,%rax,4), %ecx
+; AVX512BW-NEXT:    kmovd %k7, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    leal (%rcx,%rax,8), %ecx
+; AVX512BW-NEXT:    movl %eax, %edx
+; AVX512BW-NEXT:    shll $4, %edx
+; AVX512BW-NEXT:    orl %ecx, %edx
+; AVX512BW-NEXT:    shll $5, %eax
+; AVX512BW-NEXT:    orl %edx, %eax
+; AVX512BW-NEXT:    kmovd %k5, %ecx
+; AVX512BW-NEXT:    andl $1, %ecx
+; AVX512BW-NEXT:    movl %ecx, %edx
+; AVX512BW-NEXT:    shll $6, %edx
+; AVX512BW-NEXT:    movl %ecx, %edi
+; AVX512BW-NEXT:    shll $7, %edi
+; AVX512BW-NEXT:    orl %edx, %edi
+; AVX512BW-NEXT:    shll $8, %ecx
+; AVX512BW-NEXT:    orl %edi, %ecx
+; AVX512BW-NEXT:    kmovd %k4, %edx
+; AVX512BW-NEXT:    andl $1, %edx
+; AVX512BW-NEXT:    movl %edx, %edi
+; AVX512BW-NEXT:    shll $9, %edi
+; AVX512BW-NEXT:    orl %ecx, %edi
+; AVX512BW-NEXT:    movl %edx, %ecx
+; AVX512BW-NEXT:    shll $10, %ecx
+; AVX512BW-NEXT:    orl %edi, %ecx
+; AVX512BW-NEXT:    shll $11, %edx
+; AVX512BW-NEXT:    orl %ecx, %edx
+; AVX512BW-NEXT:    kmovd %k3, %edi
+; AVX512BW-NEXT:    andl $1, %edi
+; AVX512BW-NEXT:    movl %edi, %ecx
+; AVX512BW-NEXT:    shll $12, %ecx
+; AVX512BW-NEXT:    orl %edx, %ecx
+; AVX512BW-NEXT:    movl %edi, %edx
+; AVX512BW-NEXT:    shll $13, %edx
+; AVX512BW-NEXT:    orl %ecx, %edx
+; AVX512BW-NEXT:    shll $14, %edi
+; AVX512BW-NEXT:    orl %edx, %edi
+; AVX512BW-NEXT:    kmovd %k2, %edx
+; AVX512BW-NEXT:    andl $1, %edx
+; AVX512BW-NEXT:    movl %edx, %ecx
+; AVX512BW-NEXT:    shll $15, %ecx
+; AVX512BW-NEXT:    orl %edi, %ecx
+; AVX512BW-NEXT:    orl %eax, %ecx
+; AVX512BW-NEXT:    movw %cx, (%rsi)
+; AVX512BW-NEXT:    movl %edx, %eax
+; AVX512BW-NEXT:    shll $16, %eax
+; AVX512BW-NEXT:    shll $17, %edx
+; AVX512BW-NEXT:    orl %eax, %edx
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movl %eax, %edi
+; AVX512BW-NEXT:    shll $18, %edi
+; AVX512BW-NEXT:    orl %edx, %edi
+; AVX512BW-NEXT:    movl %eax, %edx
+; AVX512BW-NEXT:    shll $19, %edx
+; AVX512BW-NEXT:    orl %edi, %edx
+; AVX512BW-NEXT:    shll $20, %eax
+; AVX512BW-NEXT:    orl %edx, %eax
+; AVX512BW-NEXT:    kmovd %k0, %edx
+; AVX512BW-NEXT:    movl %edx, %edi
+; AVX512BW-NEXT:    andl $1, %edi
+; AVX512BW-NEXT:    movl %edi, %r8d
+; AVX512BW-NEXT:    shll $21, %r8d
+; AVX512BW-NEXT:    orl %eax, %r8d
+; AVX512BW-NEXT:    shll $22, %edi
+; AVX512BW-NEXT:    orl %r8d, %edi
+; AVX512BW-NEXT:    shll $23, %edx
+; AVX512BW-NEXT:    orl %edi, %edx
+; AVX512BW-NEXT:    orl %ecx, %edx
+; AVX512BW-NEXT:    shrl $16, %edx
+; AVX512BW-NEXT:    movb %dl, 2(%rsi)
+; AVX512BW-NEXT:    retq
+  %src.vec = load <8 x i1>, ptr %in.vec, align 64 ; read the <8 x i1> source mask from %in.vec
+  %tgt.mask = shufflevector <8 x i1> %src.vec, <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7> ; result lane i takes source lane i/3 (integer division); the second operand is poison and never selected
+  store <24 x i1> %tgt.mask, ptr %out.vec, align 64 ; write the <24 x i1> replicated mask to %out.vec
+  ret void
+}
+
+define void @mask_replication_factor3_vf16(ptr %in.vec, ptr %out.vec) nounwind { ; Replicates every lane of a <16 x i1> mask 3 times, producing a <48 x i1> mask. The check lines below are autogenerated (see the NOTE at the top of the file); regenerate them instead of editing by hand.
+; AVX512F-ONLY-LABEL: mask_replication_factor3_vf16:
+; AVX512F-ONLY:       # %bb.0:
+; AVX512F-ONLY-NEXT:    pushq %rbp
+; AVX512F-ONLY-NEXT:    pushq %r15
+; AVX512F-ONLY-NEXT:    pushq %r14
+; AVX512F-ONLY-NEXT:    pushq %r12
+; AVX512F-ONLY-NEXT:    pushq %rbx
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k4
+; AVX512F-ONLY-NEXT:    kshiftrw $15, %k4, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, %eax
+; AVX512F-ONLY-NEXT:    kshiftrw $14, %k4, %k0
+; AVX512F-ONLY-NEXT:    kshiftrw $13, %k4, %k1
+; AVX512F-ONLY-NEXT:    kshiftrw $12, %k4, %k2
+; AVX512F-ONLY-NEXT:    kshiftrw $11, %k4, %k3
+; AVX512F-ONLY-NEXT:    kshiftrw $10, %k4, %k5
+; AVX512F-ONLY-NEXT:    kmovw %k5, %ecx
+; AVX512F-ONLY-NEXT:    kshiftrw $9, %k4, %k5
+; AVX512F-ONLY-NEXT:    kmovw %k5, %edx
+; AVX512F-ONLY-NEXT:    kshiftrw $8, %k4, %k5
+; AVX512F-ONLY-NEXT:    kmovw %k5, %r8d
+; AVX512F-ONLY-NEXT:    kshiftrw $7, %k4, %k5
+; AVX512F-ONLY-NEXT:    kmovw %k5, %r9d
+; AVX512F-ONLY-NEXT:    kshiftrw $6, %k4, %k5
+; AVX512F-ONLY-NEXT:    kmovw %k5, %r10d
+; AVX512F-ONLY-NEXT:    kshiftrw $5, %k4, %k5
+; AVX512F-ONLY-NEXT:    kmovw %k5, %r11d
+; AVX512F-ONLY-NEXT:    kshiftrw $4, %k4, %k5
+; AVX512F-ONLY-NEXT:    kmovw %k5, %ebx
+; AVX512F-ONLY-NEXT:    kshiftrw $3, %k4, %k5
+; AVX512F-ONLY-NEXT:    kmovw %k5, %ebp
+; AVX512F-ONLY-NEXT:    kshiftrw $2, %k4, %k5
+; AVX512F-ONLY-NEXT:    kmovw %k5, %r14d
+; AVX512F-ONLY-NEXT:    kshiftrw $1, %k4, %k5
+; AVX512F-ONLY-NEXT:    kmovw %k5, %edi
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r15d
+; AVX512F-ONLY-NEXT:    movzbl %r15b, %r15d
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    leaq (%r15,%r15,2), %r12
+; AVX512F-ONLY-NEXT:    leaq (%r12,%r15,4), %r15
+; AVX512F-ONLY-NEXT:    movzbl %dil, %edi
+; AVX512F-ONLY-NEXT:    andl $1, %edi
+; AVX512F-ONLY-NEXT:    leaq (%r15,%rdi,8), %r15
+; AVX512F-ONLY-NEXT:    movq %rdi, %r12
+; AVX512F-ONLY-NEXT:    shlq $4, %r12
+; AVX512F-ONLY-NEXT:    orq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $5, %rdi
+; AVX512F-ONLY-NEXT:    orq %r12, %rdi
+; AVX512F-ONLY-NEXT:    movzbl %r14b, %r14d
+; AVX512F-ONLY-NEXT:    andl $1, %r14d
+; AVX512F-ONLY-NEXT:    movq %r14, %r15
+; AVX512F-ONLY-NEXT:    shlq $6, %r15
+; AVX512F-ONLY-NEXT:    movq %r14, %r12
+; AVX512F-ONLY-NEXT:    shlq $7, %r12
+; AVX512F-ONLY-NEXT:    orq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $8, %r14
+; AVX512F-ONLY-NEXT:    orq %r12, %r14
+; AVX512F-ONLY-NEXT:    movzbl %bpl, %r15d
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $9, %r12
+; AVX512F-ONLY-NEXT:    orq %r14, %r12
+; AVX512F-ONLY-NEXT:    movq %r15, %r14
+; AVX512F-ONLY-NEXT:    shlq $10, %r14
+; AVX512F-ONLY-NEXT:    orq %r12, %r14
+; AVX512F-ONLY-NEXT:    shlq $11, %r15
+; AVX512F-ONLY-NEXT:    orq %r14, %r15
+; AVX512F-ONLY-NEXT:    movzbl %bl, %ebx
+; AVX512F-ONLY-NEXT:    andl $1, %ebx
+; AVX512F-ONLY-NEXT:    movq %rbx, %r14
+; AVX512F-ONLY-NEXT:    shlq $12, %r14
+; AVX512F-ONLY-NEXT:    orq %r15, %r14
+; AVX512F-ONLY-NEXT:    movq %rbx, %r15
+; AVX512F-ONLY-NEXT:    shlq $13, %r15
+; AVX512F-ONLY-NEXT:    orq %r14, %r15
+; AVX512F-ONLY-NEXT:    shlq $14, %rbx
+; AVX512F-ONLY-NEXT:    orq %r15, %rbx
+; AVX512F-ONLY-NEXT:    movzbl %r11b, %r11d
+; AVX512F-ONLY-NEXT:    andl $1, %r11d
+; AVX512F-ONLY-NEXT:    movq %r11, %r14
+; AVX512F-ONLY-NEXT:    shlq $15, %r14
+; AVX512F-ONLY-NEXT:    orq %rbx, %r14
+; AVX512F-ONLY-NEXT:    movq %r11, %rbx
+; AVX512F-ONLY-NEXT:    shlq $16, %rbx
+; AVX512F-ONLY-NEXT:    orq %r14, %rbx
+; AVX512F-ONLY-NEXT:    shlq $17, %r11
+; AVX512F-ONLY-NEXT:    orq %rbx, %r11
+; AVX512F-ONLY-NEXT:    movzbl %r10b, %r10d
+; AVX512F-ONLY-NEXT:    andl $1, %r10d
+; AVX512F-ONLY-NEXT:    movq %r10, %rbx
+; AVX512F-ONLY-NEXT:    shlq $18, %rbx
+; AVX512F-ONLY-NEXT:    orq %r11, %rbx
+; AVX512F-ONLY-NEXT:    movq %r10, %r11
+; AVX512F-ONLY-NEXT:    shlq $19, %r11
+; AVX512F-ONLY-NEXT:    orq %rbx, %r11
+; AVX512F-ONLY-NEXT:    shlq $20, %r10
+; AVX512F-ONLY-NEXT:    orq %r11, %r10
+; AVX512F-ONLY-NEXT:    movzbl %r9b, %r9d
+; AVX512F-ONLY-NEXT:    andl $1, %r9d
+; AVX512F-ONLY-NEXT:    movq %r9, %r11
+; AVX512F-ONLY-NEXT:    shlq $21, %r11
+; AVX512F-ONLY-NEXT:    orq %r10, %r11
+; AVX512F-ONLY-NEXT:    movq %r9, %r10
+; AVX512F-ONLY-NEXT:    shlq $22, %r10
+; AVX512F-ONLY-NEXT:    orq %r11, %r10
+; AVX512F-ONLY-NEXT:    shlq $23, %r9
+; AVX512F-ONLY-NEXT:    orq %r10, %r9
+; AVX512F-ONLY-NEXT:    movzbl %r8b, %r8d
+; AVX512F-ONLY-NEXT:    andl $1, %r8d
+; AVX512F-ONLY-NEXT:    movq %r8, %r10
+; AVX512F-ONLY-NEXT:    shlq $24, %r10
+; AVX512F-ONLY-NEXT:    orq %r9, %r10
+; AVX512F-ONLY-NEXT:    movq %r8, %r9
+; AVX512F-ONLY-NEXT:    shlq $25, %r9
+; AVX512F-ONLY-NEXT:    orq %r10, %r9
+; AVX512F-ONLY-NEXT:    shlq $26, %r8
+; AVX512F-ONLY-NEXT:    orq %r9, %r8
+; AVX512F-ONLY-NEXT:    movzbl %dl, %r9d
+; AVX512F-ONLY-NEXT:    andl $1, %r9d
+; AVX512F-ONLY-NEXT:    movq %r9, %rdx
+; AVX512F-ONLY-NEXT:    shlq $27, %rdx
+; AVX512F-ONLY-NEXT:    orq %r8, %rdx
+; AVX512F-ONLY-NEXT:    movq %r9, %r8
+; AVX512F-ONLY-NEXT:    shlq $28, %r8
+; AVX512F-ONLY-NEXT:    orq %rdx, %r8
+; AVX512F-ONLY-NEXT:    shlq $29, %r9
+; AVX512F-ONLY-NEXT:    orq %r8, %r9
+; AVX512F-ONLY-NEXT:    movzbl %cl, %edx
+; AVX512F-ONLY-NEXT:    andl $1, %edx
+; AVX512F-ONLY-NEXT:    movq %rdx, %r8
+; AVX512F-ONLY-NEXT:    shlq $30, %r8
+; AVX512F-ONLY-NEXT:    orq %r9, %r8
+; AVX512F-ONLY-NEXT:    shlq $31, %rdx
+; AVX512F-ONLY-NEXT:    orq %r8, %rdx
+; AVX512F-ONLY-NEXT:    orq %rdi, %rdx
+; AVX512F-ONLY-NEXT:    movl %edx, (%rsi)
+; AVX512F-ONLY-NEXT:    andl $1, %ecx
+; AVX512F-ONLY-NEXT:    shlq $32, %rcx
+; AVX512F-ONLY-NEXT:    kmovw %k3, %edi
+; AVX512F-ONLY-NEXT:    andl $1, %edi
+; AVX512F-ONLY-NEXT:    movq %rdi, %r8
+; AVX512F-ONLY-NEXT:    shlq $33, %r8
+; AVX512F-ONLY-NEXT:    orq %rcx, %r8
+; AVX512F-ONLY-NEXT:    movq %rdi, %rcx
+; AVX512F-ONLY-NEXT:    shlq $34, %rcx
+; AVX512F-ONLY-NEXT:    orq %r8, %rcx
+; AVX512F-ONLY-NEXT:    shlq $35, %rdi
+; AVX512F-ONLY-NEXT:    orq %rcx, %rdi
+; AVX512F-ONLY-NEXT:    kmovw %k2, %ecx
+; AVX512F-ONLY-NEXT:    andl $1, %ecx
+; AVX512F-ONLY-NEXT:    movq %rcx, %r8
+; AVX512F-ONLY-NEXT:    shlq $36, %r8
+; AVX512F-ONLY-NEXT:    orq %rdi, %r8
+; AVX512F-ONLY-NEXT:    movq %rcx, %rdi
+; AVX512F-ONLY-NEXT:    shlq $37, %rdi
+; AVX512F-ONLY-NEXT:    orq %r8, %rdi
+; AVX512F-ONLY-NEXT:    shlq $38, %rcx
+; AVX512F-ONLY-NEXT:    orq %rdi, %rcx
+; AVX512F-ONLY-NEXT:    kmovw %k1, %edi
+; AVX512F-ONLY-NEXT:    andl $1, %edi
+; AVX512F-ONLY-NEXT:    movq %rdi, %r8
+; AVX512F-ONLY-NEXT:    shlq $39, %r8
+; AVX512F-ONLY-NEXT:    orq %rcx, %r8
+; AVX512F-ONLY-NEXT:    movq %rdi, %rcx
+; AVX512F-ONLY-NEXT:    shlq $40, %rcx
+; AVX512F-ONLY-NEXT:    orq %r8, %rcx
+; AVX512F-ONLY-NEXT:    shlq $41, %rdi
+; AVX512F-ONLY-NEXT:    orq %rcx, %rdi
+; AVX512F-ONLY-NEXT:    kmovw %k0, %ecx
+; AVX512F-ONLY-NEXT:    andl $1, %ecx
+; AVX512F-ONLY-NEXT:    movq %rcx, %r8
+; AVX512F-ONLY-NEXT:    shlq $42, %r8
+; AVX512F-ONLY-NEXT:    orq %rdi, %r8
+; AVX512F-ONLY-NEXT:    movq %rcx, %rdi
+; AVX512F-ONLY-NEXT:    shlq $43, %rdi
+; AVX512F-ONLY-NEXT:    orq %r8, %rdi
+; AVX512F-ONLY-NEXT:    shlq $44, %rcx
+; AVX512F-ONLY-NEXT:    orq %rdi, %rcx
+; AVX512F-ONLY-NEXT:    movzbl %al, %edi
+; AVX512F-ONLY-NEXT:    # kill: def $eax killed $eax def $rax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %r8
+; AVX512F-ONLY-NEXT:    shlq $45, %r8
+; AVX512F-ONLY-NEXT:    orq %rcx, %r8
+; AVX512F-ONLY-NEXT:    shlq $46, %rax
+; AVX512F-ONLY-NEXT:    orq %r8, %rax
+; AVX512F-ONLY-NEXT:    shlq $47, %rdi
+; AVX512F-ONLY-NEXT:    orq %rax, %rdi
+; AVX512F-ONLY-NEXT:    orq %rdx, %rdi
+; AVX512F-ONLY-NEXT:    shrq $32, %rdi
+; AVX512F-ONLY-NEXT:    movw %di, 4(%rsi)
+; AVX512F-ONLY-NEXT:    popq %rbx
+; AVX512F-ONLY-NEXT:    popq %r12
+; AVX512F-ONLY-NEXT:    popq %r14
+; AVX512F-ONLY-NEXT:    popq %r15
+; AVX512F-ONLY-NEXT:    popq %rbp
+; AVX512F-ONLY-NEXT:    retq
+;
+; AVX512DQ-LABEL: mask_replication_factor3_vf16:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    pushq %rbp
+; AVX512DQ-NEXT:    pushq %r15
+; AVX512DQ-NEXT:    pushq %r14
+; AVX512DQ-NEXT:    pushq %r12
+; AVX512DQ-NEXT:    pushq %rbx
+; AVX512DQ-NEXT:    kmovw (%rdi), %k4
+; AVX512DQ-NEXT:    kshiftrw $15, %k4, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    kshiftrw $14, %k4, %k0
+; AVX512DQ-NEXT:    kshiftrw $13, %k4, %k1
+; AVX512DQ-NEXT:    kshiftrw $12, %k4, %k2
+; AVX512DQ-NEXT:    kshiftrw $11, %k4, %k3
+; AVX512DQ-NEXT:    kshiftrw $10, %k4, %k5
+; AVX512DQ-NEXT:    kmovw %k5, %ecx
+; AVX512DQ-NEXT:    kshiftrw $9, %k4, %k5
+; AVX512DQ-NEXT:    kmovw %k5, %edx
+; AVX512DQ-NEXT:    kshiftrw $8, %k4, %k5
+; AVX512DQ-NEXT:    kmovw %k5, %r8d
+; AVX512DQ-NEXT:    kshiftrw $7, %k4, %k5
+; AVX512DQ-NEXT:    kmovw %k5, %r9d
+; AVX512DQ-NEXT:    kshiftrw $6, %k4, %k5
+; AVX512DQ-NEXT:    kmovw %k5, %r10d
+; AVX512DQ-NEXT:    kshiftrw $5, %k4, %k5
+; AVX512DQ-NEXT:    kmovw %k5, %r11d
+; AVX512DQ-NEXT:    kshiftrw $4, %k4, %k5
+; AVX512DQ-NEXT:    kmovw %k5, %ebx
+; AVX512DQ-NEXT:    kshiftrw $3, %k4, %k5
+; AVX512DQ-NEXT:    kmovw %k5, %ebp
+; AVX512DQ-NEXT:    kshiftrw $2, %k4, %k5
+; AVX512DQ-NEXT:    kmovw %k5, %r14d
+; AVX512DQ-NEXT:    kshiftrw $1, %k4, %k5
+; AVX512DQ-NEXT:    kmovw %k5, %edi
+; AVX512DQ-NEXT:    kmovw %k4, %r15d
+; AVX512DQ-NEXT:    movzbl %r15b, %r15d
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    leaq (%r15,%r15,2), %r12
+; AVX512DQ-NEXT:    leaq (%r12,%r15,4), %r15
+; AVX512DQ-NEXT:    movzbl %dil, %edi
+; AVX512DQ-NEXT:    andl $1, %edi
+; AVX512DQ-NEXT:    leaq (%r15,%rdi,8), %r15
+; AVX512DQ-NEXT:    movq %rdi, %r12
+; AVX512DQ-NEXT:    shlq $4, %r12
+; AVX512DQ-NEXT:    orq %r15, %r12
+; AVX512DQ-NEXT:    shlq $5, %rdi
+; AVX512DQ-NEXT:    orq %r12, %rdi
+; AVX512DQ-NEXT:    movzbl %r14b, %r14d
+; AVX512DQ-NEXT:    andl $1, %r14d
+; AVX512DQ-NEXT:    movq %r14, %r15
+; AVX512DQ-NEXT:    shlq $6, %r15
+; AVX512DQ-NEXT:    movq %r14, %r12
+; AVX512DQ-NEXT:    shlq $7, %r12
+; AVX512DQ-NEXT:    orq %r15, %r12
+; AVX512DQ-NEXT:    shlq $8, %r14
+; AVX512DQ-NEXT:    orq %r12, %r14
+; AVX512DQ-NEXT:    movzbl %bpl, %r15d
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $9, %r12
+; AVX512DQ-NEXT:    orq %r14, %r12
+; AVX512DQ-NEXT:    movq %r15, %r14
+; AVX512DQ-NEXT:    shlq $10, %r14
+; AVX512DQ-NEXT:    orq %r12, %r14
+; AVX512DQ-NEXT:    shlq $11, %r15
+; AVX512DQ-NEXT:    orq %r14, %r15
+; AVX512DQ-NEXT:    movzbl %bl, %ebx
+; AVX512DQ-NEXT:    andl $1, %ebx
+; AVX512DQ-NEXT:    movq %rbx, %r14
+; AVX512DQ-NEXT:    shlq $12, %r14
+; AVX512DQ-NEXT:    orq %r15, %r14
+; AVX512DQ-NEXT:    movq %rbx, %r15
+; AVX512DQ-NEXT:    shlq $13, %r15
+; AVX512DQ-NEXT:    orq %r14, %r15
+; AVX512DQ-NEXT:    shlq $14, %rbx
+; AVX512DQ-NEXT:    orq %r15, %rbx
+; AVX512DQ-NEXT:    movzbl %r11b, %r11d
+; AVX512DQ-NEXT:    andl $1, %r11d
+; AVX512DQ-NEXT:    movq %r11, %r14
+; AVX512DQ-NEXT:    shlq $15, %r14
+; AVX512DQ-NEXT:    orq %rbx, %r14
+; AVX512DQ-NEXT:    movq %r11, %rbx
+; AVX512DQ-NEXT:    shlq $16, %rbx
+; AVX512DQ-NEXT:    orq %r14, %rbx
+; AVX512DQ-NEXT:    shlq $17, %r11
+; AVX512DQ-NEXT:    orq %rbx, %r11
+; AVX512DQ-NEXT:    movzbl %r10b, %r10d
+; AVX512DQ-NEXT:    andl $1, %r10d
+; AVX512DQ-NEXT:    movq %r10, %rbx
+; AVX512DQ-NEXT:    shlq $18, %rbx
+; AVX512DQ-NEXT:    orq %r11, %rbx
+; AVX512DQ-NEXT:    movq %r10, %r11
+; AVX512DQ-NEXT:    shlq $19, %r11
+; AVX512DQ-NEXT:    orq %rbx, %r11
+; AVX512DQ-NEXT:    shlq $20, %r10
+; AVX512DQ-NEXT:    orq %r11, %r10
+; AVX512DQ-NEXT:    movzbl %r9b, %r9d
+; AVX512DQ-NEXT:    andl $1, %r9d
+; AVX512DQ-NEXT:    movq %r9, %r11
+; AVX512DQ-NEXT:    shlq $21, %r11
+; AVX512DQ-NEXT:    orq %r10, %r11
+; AVX512DQ-NEXT:    movq %r9, %r10
+; AVX512DQ-NEXT:    shlq $22, %r10
+; AVX512DQ-NEXT:    orq %r11, %r10
+; AVX512DQ-NEXT:    shlq $23, %r9
+; AVX512DQ-NEXT:    orq %r10, %r9
+; AVX512DQ-NEXT:    movzbl %r8b, %r8d
+; AVX512DQ-NEXT:    andl $1, %r8d
+; AVX512DQ-NEXT:    movq %r8, %r10
+; AVX512DQ-NEXT:    shlq $24, %r10
+; AVX512DQ-NEXT:    orq %r9, %r10
+; AVX512DQ-NEXT:    movq %r8, %r9
+; AVX512DQ-NEXT:    shlq $25, %r9
+; AVX512DQ-NEXT:    orq %r10, %r9
+; AVX512DQ-NEXT:    shlq $26, %r8
+; AVX512DQ-NEXT:    orq %r9, %r8
+; AVX512DQ-NEXT:    movzbl %dl, %r9d
+; AVX512DQ-NEXT:    andl $1, %r9d
+; AVX512DQ-NEXT:    movq %r9, %rdx
+; AVX512DQ-NEXT:    shlq $27, %rdx
+; AVX512DQ-NEXT:    orq %r8, %rdx
+; AVX512DQ-NEXT:    movq %r9, %r8
+; AVX512DQ-NEXT:    shlq $28, %r8
+; AVX512DQ-NEXT:    orq %rdx, %r8
+; AVX512DQ-NEXT:    shlq $29, %r9
+; AVX512DQ-NEXT:    orq %r8, %r9
+; AVX512DQ-NEXT:    movzbl %cl, %edx
+; AVX512DQ-NEXT:    andl $1, %edx
+; AVX512DQ-NEXT:    movq %rdx, %r8
+; AVX512DQ-NEXT:    shlq $30, %r8
+; AVX512DQ-NEXT:    orq %r9, %r8
+; AVX512DQ-NEXT:    shlq $31, %rdx
+; AVX512DQ-NEXT:    orq %r8, %rdx
+; AVX512DQ-NEXT:    orq %rdi, %rdx
+; AVX512DQ-NEXT:    movl %edx, (%rsi)
+; AVX512DQ-NEXT:    andl $1, %ecx
+; AVX512DQ-NEXT:    shlq $32, %rcx
+; AVX512DQ-NEXT:    kmovw %k3, %edi
+; AVX512DQ-NEXT:    andl $1, %edi
+; AVX512DQ-NEXT:    movq %rdi, %r8
+; AVX512DQ-NEXT:    shlq $33, %r8
+; AVX512DQ-NEXT:    orq %rcx, %r8
+; AVX512DQ-NEXT:    movq %rdi, %rcx
+; AVX512DQ-NEXT:    shlq $34, %rcx
+; AVX512DQ-NEXT:    orq %r8, %rcx
+; AVX512DQ-NEXT:    shlq $35, %rdi
+; AVX512DQ-NEXT:    orq %rcx, %rdi
+; AVX512DQ-NEXT:    kmovw %k2, %ecx
+; AVX512DQ-NEXT:    andl $1, %ecx
+; AVX512DQ-NEXT:    movq %rcx, %r8
+; AVX512DQ-NEXT:    shlq $36, %r8
+; AVX512DQ-NEXT:    orq %rdi, %r8
+; AVX512DQ-NEXT:    movq %rcx, %rdi
+; AVX512DQ-NEXT:    shlq $37, %rdi
+; AVX512DQ-NEXT:    orq %r8, %rdi
+; AVX512DQ-NEXT:    shlq $38, %rcx
+; AVX512DQ-NEXT:    orq %rdi, %rcx
+; AVX512DQ-NEXT:    kmovw %k1, %edi
+; AVX512DQ-NEXT:    andl $1, %edi
+; AVX512DQ-NEXT:    movq %rdi, %r8
+; AVX512DQ-NEXT:    shlq $39, %r8
+; AVX512DQ-NEXT:    orq %rcx, %r8
+; AVX512DQ-NEXT:    movq %rdi, %rcx
+; AVX512DQ-NEXT:    shlq $40, %rcx
+; AVX512DQ-NEXT:    orq %r8, %rcx
+; AVX512DQ-NEXT:    shlq $41, %rdi
+; AVX512DQ-NEXT:    orq %rcx, %rdi
+; AVX512DQ-NEXT:    kmovw %k0, %ecx
+; AVX512DQ-NEXT:    andl $1, %ecx
+; AVX512DQ-NEXT:    movq %rcx, %r8
+; AVX512DQ-NEXT:    shlq $42, %r8
+; AVX512DQ-NEXT:    orq %rdi, %r8
+; AVX512DQ-NEXT:    movq %rcx, %rdi
+; AVX512DQ-NEXT:    shlq $43, %rdi
+; AVX512DQ-NEXT:    orq %r8, %rdi
+; AVX512DQ-NEXT:    shlq $44, %rcx
+; AVX512DQ-NEXT:    orq %rdi, %rcx
+; AVX512DQ-NEXT:    movzbl %al, %edi
+; AVX512DQ-NEXT:    # kill: def $eax killed $eax def $rax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %r8
+; AVX512DQ-NEXT:    shlq $45, %r8
+; AVX512DQ-NEXT:    orq %rcx, %r8
+; AVX512DQ-NEXT:    shlq $46, %rax
+; AVX512DQ-NEXT:    orq %r8, %rax
+; AVX512DQ-NEXT:    shlq $47, %rdi
+; AVX512DQ-NEXT:    orq %rax, %rdi
+; AVX512DQ-NEXT:    orq %rdx, %rdi
+; AVX512DQ-NEXT:    shrq $32, %rdi
+; AVX512DQ-NEXT:    movw %di, 4(%rsi)
+; AVX512DQ-NEXT:    popq %rbx
+; AVX512DQ-NEXT:    popq %r12
+; AVX512DQ-NEXT:    popq %r14
+; AVX512DQ-NEXT:    popq %r15
+; AVX512DQ-NEXT:    popq %rbp
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: mask_replication_factor3_vf16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    pushq %rbp
+; AVX512BW-NEXT:    pushq %r15
+; AVX512BW-NEXT:    pushq %r14
+; AVX512BW-NEXT:    pushq %r12
+; AVX512BW-NEXT:    pushq %rbx
+; AVX512BW-NEXT:    kmovw (%rdi), %k4
+; AVX512BW-NEXT:    kshiftrw $15, %k4, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    kshiftrw $14, %k4, %k0
+; AVX512BW-NEXT:    kshiftrw $13, %k4, %k1
+; AVX512BW-NEXT:    kshiftrw $12, %k4, %k2
+; AVX512BW-NEXT:    kshiftrw $11, %k4, %k3
+; AVX512BW-NEXT:    kshiftrw $10, %k4, %k5
+; AVX512BW-NEXT:    kmovd %k5, %ecx
+; AVX512BW-NEXT:    kshiftrw $9, %k4, %k5
+; AVX512BW-NEXT:    kmovd %k5, %edx
+; AVX512BW-NEXT:    kshiftrw $8, %k4, %k5
+; AVX512BW-NEXT:    kmovd %k5, %r8d
+; AVX512BW-NEXT:    kshiftrw $7, %k4, %k5
+; AVX512BW-NEXT:    kmovd %k5, %r9d
+; AVX512BW-NEXT:    kshiftrw $6, %k4, %k5
+; AVX512BW-NEXT:    kmovd %k5, %r10d
+; AVX512BW-NEXT:    kshiftrw $5, %k4, %k5
+; AVX512BW-NEXT:    kmovd %k5, %r11d
+; AVX512BW-NEXT:    kshiftrw $4, %k4, %k5
+; AVX512BW-NEXT:    kmovd %k5, %ebx
+; AVX512BW-NEXT:    kshiftrw $3, %k4, %k5
+; AVX512BW-NEXT:    kmovd %k5, %ebp
+; AVX512BW-NEXT:    kshiftrw $2, %k4, %k5
+; AVX512BW-NEXT:    kmovd %k5, %r14d
+; AVX512BW-NEXT:    kshiftrw $1, %k4, %k5
+; AVX512BW-NEXT:    kmovd %k5, %edi
+; AVX512BW-NEXT:    kmovd %k4, %r15d
+; AVX512BW-NEXT:    movzbl %r15b, %r15d
+; AVX512BW-NEXT:    andl $1, %r15d
+; AVX512BW-NEXT:    leaq (%r15,%r15,2), %r12
+; AVX512BW-NEXT:    leaq (%r12,%r15,4), %r15
+; AVX512BW-NEXT:    movzbl %dil, %edi
+; AVX512BW-NEXT:    andl $1, %edi
+; AVX512BW-NEXT:    leaq (%r15,%rdi,8), %r15
+; AVX512BW-NEXT:    movq %rdi, %r12
+; AVX512BW-NEXT:    shlq $4, %r12
+; AVX512BW-NEXT:    orq %r15, %r12
+; AVX512BW-NEXT:    shlq $5, %rdi
+; AVX512BW-NEXT:    orq %r12, %rdi
+; AVX512BW-NEXT:    movzbl %r14b, %r14d
+; AVX512BW-NEXT:    andl $1, %r14d
+; AVX512BW-NEXT:    movq %r14, %r15
+; AVX512BW-NEXT:    shlq $6, %r15
+; AVX512BW-NEXT:    movq %r14, %r12
+; AVX512BW-NEXT:    shlq $7, %r12
+; AVX512BW-NEXT:    orq %r15, %r12
+; AVX512BW-NEXT:    shlq $8, %r14
+; AVX512BW-NEXT:    orq %r12, %r14
+; AVX512BW-NEXT:    movzbl %bpl, %r15d
+; AVX512BW-NEXT:    andl $1, %r15d
+; AVX512BW-NEXT:    movq %r15, %r12
+; AVX512BW-NEXT:    shlq $9, %r12
+; AVX512BW-NEXT:    orq %r14, %r12
+; AVX512BW-NEXT:    movq %r15, %r14
+; AVX512BW-NEXT:    shlq $10, %r14
+; AVX512BW-NEXT:    orq %r12, %r14
+; AVX512BW-NEXT:    shlq $11, %r15
+; AVX512BW-NEXT:    orq %r14, %r15
+; AVX512BW-NEXT:    movzbl %bl, %ebx
+; AVX512BW-NEXT:    andl $1, %ebx
+; AVX512BW-NEXT:    movq %rbx, %r14
+; AVX512BW-NEXT:    shlq $12, %r14
+; AVX512BW-NEXT:    orq %r15, %r14
+; AVX512BW-NEXT:    movq %rbx, %r15
+; AVX512BW-NEXT:    shlq $13, %r15
+; AVX512BW-NEXT:    orq %r14, %r15
+; AVX512BW-NEXT:    shlq $14, %rbx
+; AVX512BW-NEXT:    orq %r15, %rbx
+; AVX512BW-NEXT:    movzbl %r11b, %r11d
+; AVX512BW-NEXT:    andl $1, %r11d
+; AVX512BW-NEXT:    movq %r11, %r14
+; AVX512BW-NEXT:    shlq $15, %r14
+; AVX512BW-NEXT:    orq %rbx, %r14
+; AVX512BW-NEXT:    movq %r11, %rbx
+; AVX512BW-NEXT:    shlq $16, %rbx
+; AVX512BW-NEXT:    orq %r14, %rbx
+; AVX512BW-NEXT:    shlq $17, %r11
+; AVX512BW-NEXT:    orq %rbx, %r11
+; AVX512BW-NEXT:    movzbl %r10b, %r10d
+; AVX512BW-NEXT:    andl $1, %r10d
+; AVX512BW-NEXT:    movq %r10, %rbx
+; AVX512BW-NEXT:    shlq $18, %rbx
+; AVX512BW-NEXT:    orq %r11, %rbx
+; AVX512BW-NEXT:    movq %r10, %r11
+; AVX512BW-NEXT:    shlq $19, %r11
+; AVX512BW-NEXT:    orq %rbx, %r11
+; AVX512BW-NEXT:    shlq $20, %r10
+; AVX512BW-NEXT:    orq %r11, %r10
+; AVX512BW-NEXT:    movzbl %r9b, %r9d
+; AVX512BW-NEXT:    andl $1, %r9d
+; AVX512BW-NEXT:    movq %r9, %r11
+; AVX512BW-NEXT:    shlq $21, %r11
+; AVX512BW-NEXT:    orq %r10, %r11
+; AVX512BW-NEXT:    movq %r9, %r10
+; AVX512BW-NEXT:    shlq $22, %r10
+; AVX512BW-NEXT:    orq %r11, %r10
+; AVX512BW-NEXT:    shlq $23, %r9
+; AVX512BW-NEXT:    orq %r10, %r9
+; AVX512BW-NEXT:    movzbl %r8b, %r8d
+; AVX512BW-NEXT:    andl $1, %r8d
+; AVX512BW-NEXT:    movq %r8, %r10
+; AVX512BW-NEXT:    shlq $24, %r10
+; AVX512BW-NEXT:    orq %r9, %r10
+; AVX512BW-NEXT:    movq %r8, %r9
+; AVX512BW-NEXT:    shlq $25, %r9
+; AVX512BW-NEXT:    orq %r10, %r9
+; AVX512BW-NEXT:    shlq $26, %r8
+; AVX512BW-NEXT:    orq %r9, %r8
+; AVX512BW-NEXT:    movzbl %dl, %r9d
+; AVX512BW-NEXT:    andl $1, %r9d
+; AVX512BW-NEXT:    movq %r9, %rdx
+; AVX512BW-NEXT:    shlq $27, %rdx
+; AVX512BW-NEXT:    orq %r8, %rdx
+; AVX512BW-NEXT:    movq %r9, %r8
+; AVX512BW-NEXT:    shlq $28, %r8
+; AVX512BW-NEXT:    orq %rdx, %r8
+; AVX512BW-NEXT:    shlq $29, %r9
+; AVX512BW-NEXT:    orq %r8, %r9
+; AVX512BW-NEXT:    movzbl %cl, %edx
+; AVX512BW-NEXT:    andl $1, %edx
+; AVX512BW-NEXT:    movq %rdx, %r8
+; AVX512BW-NEXT:    shlq $30, %r8
+; AVX512BW-NEXT:    orq %r9, %r8
+; AVX512BW-NEXT:    shlq $31, %rdx
+; AVX512BW-NEXT:    orq %r8, %rdx
+; AVX512BW-NEXT:    orq %rdi, %rdx
+; AVX512BW-NEXT:    movl %edx, (%rsi)
+; AVX512BW-NEXT:    andl $1, %ecx
+; AVX512BW-NEXT:    shlq $32, %rcx
+; AVX512BW-NEXT:    kmovd %k3, %edi
+; AVX512BW-NEXT:    andl $1, %edi
+; AVX512BW-NEXT:    movq %rdi, %r8
+; AVX512BW-NEXT:    shlq $33, %r8
+; AVX512BW-NEXT:    orq %rcx, %r8
+; AVX512BW-NEXT:    movq %rdi, %rcx
+; AVX512BW-NEXT:    shlq $34, %rcx
+; AVX512BW-NEXT:    orq %r8, %rcx
+; AVX512BW-NEXT:    shlq $35, %rdi
+; AVX512BW-NEXT:    orq %rcx, %rdi
+; AVX512BW-NEXT:    kmovd %k2, %ecx
+; AVX512BW-NEXT:    andl $1, %ecx
+; AVX512BW-NEXT:    movq %rcx, %r8
+; AVX512BW-NEXT:    shlq $36, %r8
+; AVX512BW-NEXT:    orq %rdi, %r8
+; AVX512BW-NEXT:    movq %rcx, %rdi
+; AVX512BW-NEXT:    shlq $37, %rdi
+; AVX512BW-NEXT:    orq %r8, %rdi
+; AVX512BW-NEXT:    shlq $38, %rcx
+; AVX512BW-NEXT:    orq %rdi, %rcx
+; AVX512BW-NEXT:    kmovd %k1, %edi
+; AVX512BW-NEXT:    andl $1, %edi
+; AVX512BW-NEXT:    movq %rdi, %r8
+; AVX512BW-NEXT:    shlq $39, %r8
+; AVX512BW-NEXT:    orq %rcx, %r8
+; AVX512BW-NEXT:    movq %rdi, %rcx
+; AVX512BW-NEXT:    shlq $40, %rcx
+; AVX512BW-NEXT:    orq %r8, %rcx
+; AVX512BW-NEXT:    shlq $41, %rdi
+; AVX512BW-NEXT:    orq %rcx, %rdi
+; AVX512BW-NEXT:    kmovd %k0, %ecx
+; AVX512BW-NEXT:    andl $1, %ecx
+; AVX512BW-NEXT:    movq %rcx, %r8
+; AVX512BW-NEXT:    shlq $42, %r8
+; AVX512BW-NEXT:    orq %rdi, %r8
+; AVX512BW-NEXT:    movq %rcx, %rdi
+; AVX512BW-NEXT:    shlq $43, %rdi
+; AVX512BW-NEXT:    orq %r8, %rdi
+; AVX512BW-NEXT:    shlq $44, %rcx
+; AVX512BW-NEXT:    orq %rdi, %rcx
+; AVX512BW-NEXT:    movzbl %al, %edi
+; AVX512BW-NEXT:    # kill: def $eax killed $eax def $rax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %r8
+; AVX512BW-NEXT:    shlq $45, %r8
+; AVX512BW-NEXT:    orq %rcx, %r8
+; AVX512BW-NEXT:    shlq $46, %rax
+; AVX512BW-NEXT:    orq %r8, %rax
+; AVX512BW-NEXT:    shlq $47, %rdi
+; AVX512BW-NEXT:    orq %rax, %rdi
+; AVX512BW-NEXT:    orq %rdx, %rdi
+; AVX512BW-NEXT:    shrq $32, %rdi
+; AVX512BW-NEXT:    movw %di, 4(%rsi)
+; AVX512BW-NEXT:    popq %rbx
+; AVX512BW-NEXT:    popq %r12
+; AVX512BW-NEXT:    popq %r14
+; AVX512BW-NEXT:    popq %r15
+; AVX512BW-NEXT:    popq %rbp
+; AVX512BW-NEXT:    retq
+  %src.vec = load <16 x i1>, ptr %in.vec, align 64 ; read the <16 x i1> source mask from %in.vec
+  %tgt.mask = shufflevector <16 x i1> %src.vec, <16 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15> ; result lane i takes source lane i/3 (integer division); the second operand is poison and never selected
+  store <48 x i1> %tgt.mask, ptr %out.vec, align 64 ; write the <48 x i1> replicated mask to %out.vec
+  ret void
+}
+
+define void @mask_replication_factor3_vf32(ptr %in.vec, ptr %out.vec) nounwind {
+; AVX512F-ONLY-LABEL: mask_replication_factor3_vf32:
+; AVX512F-ONLY:       # %bb.0:
+; AVX512F-ONLY-NEXT:    pushq %rbp
+; AVX512F-ONLY-NEXT:    pushq %r15
+; AVX512F-ONLY-NEXT:    pushq %r14
+; AVX512F-ONLY-NEXT:    pushq %r13
+; AVX512F-ONLY-NEXT:    pushq %r12
+; AVX512F-ONLY-NEXT:    pushq %rbx
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
+; AVX512F-ONLY-NEXT:    kmovw 2(%rdi), %k0
+; AVX512F-ONLY-NEXT:    kshiftrw $5, %k0, %k2
+; AVX512F-ONLY-NEXT:    kmovw %k2, %eax
+; AVX512F-ONLY-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512F-ONLY-NEXT:    kshiftrw $10, %k1, %k2
+; AVX512F-ONLY-NEXT:    kmovw %k2, %ecx
+; AVX512F-ONLY-NEXT:    kshiftrw $9, %k1, %k2
+; AVX512F-ONLY-NEXT:    kmovw %k2, %edx
+; AVX512F-ONLY-NEXT:    kshiftrw $8, %k1, %k2
+; AVX512F-ONLY-NEXT:    kmovw %k2, %edi
+; AVX512F-ONLY-NEXT:    kshiftrw $7, %k1, %k2
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r8d
+; AVX512F-ONLY-NEXT:    kshiftrw $6, %k1, %k2
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r9d
+; AVX512F-ONLY-NEXT:    kshiftrw $5, %k1, %k2
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r10d
+; AVX512F-ONLY-NEXT:    kshiftrw $4, %k1, %k2
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r11d
+; AVX512F-ONLY-NEXT:    kshiftrw $3, %k1, %k2
+; AVX512F-ONLY-NEXT:    kmovw %k2, %ebx
+; AVX512F-ONLY-NEXT:    kshiftrw $6, %k0, %k2
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r14d
+; AVX512F-ONLY-NEXT:    kshiftrw $7, %k0, %k2
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r15d
+; AVX512F-ONLY-NEXT:    kshiftrw $8, %k0, %k2
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $9, %k0, %k2
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r13d
+; AVX512F-ONLY-NEXT:    kshiftrw $10, %k0, %k2
+; AVX512F-ONLY-NEXT:    andl $1, %r14d
+; AVX512F-ONLY-NEXT:    movl %eax, %ebp
+; AVX512F-ONLY-NEXT:    andl $1, %ebp
+; AVX512F-ONLY-NEXT:    leal (%rbp,%rbp,2), %ebp
+; AVX512F-ONLY-NEXT:    leal (%rbp,%r14,4), %ebp
+; AVX512F-ONLY-NEXT:    leal (%rbp,%r14,8), %ebp
+; AVX512F-ONLY-NEXT:    shll $4, %r14d
+; AVX512F-ONLY-NEXT:    orl %ebp, %r14d
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    movl %r15d, %ebp
+; AVX512F-ONLY-NEXT:    shll $5, %ebp
+; AVX512F-ONLY-NEXT:    orl %r14d, %ebp
+; AVX512F-ONLY-NEXT:    movl %r15d, %r14d
+; AVX512F-ONLY-NEXT:    shll $6, %r14d
+; AVX512F-ONLY-NEXT:    shll $7, %r15d
+; AVX512F-ONLY-NEXT:    orl %r14d, %r15d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movl %r12d, %r14d
+; AVX512F-ONLY-NEXT:    shll $8, %r14d
+; AVX512F-ONLY-NEXT:    orl %r15d, %r14d
+; AVX512F-ONLY-NEXT:    movl %r12d, %r15d
+; AVX512F-ONLY-NEXT:    shll $9, %r15d
+; AVX512F-ONLY-NEXT:    orl %r14d, %r15d
+; AVX512F-ONLY-NEXT:    kmovw %k2, %eax
+; AVX512F-ONLY-NEXT:    kshiftrw $11, %k0, %k2
+; AVX512F-ONLY-NEXT:    shll $10, %r12d
+; AVX512F-ONLY-NEXT:    orl %r15d, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movl %r13d, %r14d
+; AVX512F-ONLY-NEXT:    shll $11, %r14d
+; AVX512F-ONLY-NEXT:    orl %r12d, %r14d
+; AVX512F-ONLY-NEXT:    movl %r13d, %r15d
+; AVX512F-ONLY-NEXT:    shll $12, %r15d
+; AVX512F-ONLY-NEXT:    orl %r14d, %r15d
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $12, %k0, %k2
+; AVX512F-ONLY-NEXT:    shll $13, %r13d
+; AVX512F-ONLY-NEXT:    orl %r15d, %r13d
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movl %eax, %r14d
+; AVX512F-ONLY-NEXT:    shll $14, %r14d
+; AVX512F-ONLY-NEXT:    orl %r13d, %r14d
+; AVX512F-ONLY-NEXT:    movl %eax, %r15d
+; AVX512F-ONLY-NEXT:    shll $15, %r15d
+; AVX512F-ONLY-NEXT:    orl %r14d, %r15d
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r14d
+; AVX512F-ONLY-NEXT:    kshiftrw $13, %k0, %k2
+; AVX512F-ONLY-NEXT:    shll $16, %eax
+; AVX512F-ONLY-NEXT:    orl %r15d, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movl %r12d, %r15d
+; AVX512F-ONLY-NEXT:    shll $17, %r15d
+; AVX512F-ONLY-NEXT:    orl %eax, %r15d
+; AVX512F-ONLY-NEXT:    movl %r12d, %eax
+; AVX512F-ONLY-NEXT:    shll $18, %eax
+; AVX512F-ONLY-NEXT:    orl %r15d, %eax
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r15d
+; AVX512F-ONLY-NEXT:    kshiftrw $14, %k0, %k2
+; AVX512F-ONLY-NEXT:    shll $19, %r12d
+; AVX512F-ONLY-NEXT:    orl %eax, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r14d
+; AVX512F-ONLY-NEXT:    movl %r14d, %eax
+; AVX512F-ONLY-NEXT:    shll $20, %eax
+; AVX512F-ONLY-NEXT:    orl %r12d, %eax
+; AVX512F-ONLY-NEXT:    movl %r14d, %r12d
+; AVX512F-ONLY-NEXT:    shll $21, %r12d
+; AVX512F-ONLY-NEXT:    orl %eax, %r12d
+; AVX512F-ONLY-NEXT:    kmovw %k2, %eax
+; AVX512F-ONLY-NEXT:    kshiftrw $15, %k0, %k2
+; AVX512F-ONLY-NEXT:    shll $22, %r14d
+; AVX512F-ONLY-NEXT:    orl %r12d, %r14d
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    movl %r15d, %r12d
+; AVX512F-ONLY-NEXT:    shll $23, %r12d
+; AVX512F-ONLY-NEXT:    orl %r14d, %r12d
+; AVX512F-ONLY-NEXT:    movl %r15d, %r13d
+; AVX512F-ONLY-NEXT:    shll $24, %r13d
+; AVX512F-ONLY-NEXT:    orl %r12d, %r13d
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r14d
+; AVX512F-ONLY-NEXT:    kshiftrw $2, %k1, %k2
+; AVX512F-ONLY-NEXT:    shll $25, %r15d
+; AVX512F-ONLY-NEXT:    orl %r13d, %r15d
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movl %eax, %r12d
+; AVX512F-ONLY-NEXT:    shll $26, %r12d
+; AVX512F-ONLY-NEXT:    orl %r15d, %r12d
+; AVX512F-ONLY-NEXT:    movl %eax, %r15d
+; AVX512F-ONLY-NEXT:    shll $27, %r15d
+; AVX512F-ONLY-NEXT:    orl %r12d, %r15d
+; AVX512F-ONLY-NEXT:    shll $28, %eax
+; AVX512F-ONLY-NEXT:    orl %r15d, %eax
+; AVX512F-ONLY-NEXT:    movl %r14d, %r13d
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movl %r13d, %r12d
+; AVX512F-ONLY-NEXT:    shll $29, %r12d
+; AVX512F-ONLY-NEXT:    orl %eax, %r12d
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r15d
+; AVX512F-ONLY-NEXT:    kshiftrw $1, %k1, %k2
+; AVX512F-ONLY-NEXT:    shll $30, %r13d
+; AVX512F-ONLY-NEXT:    orl %r12d, %r13d
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $11, %k1, %k2
+; AVX512F-ONLY-NEXT:    shll $31, %r14d
+; AVX512F-ONLY-NEXT:    orl %r13d, %r14d
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r13d
+; AVX512F-ONLY-NEXT:    kshiftrw $12, %k1, %k2
+; AVX512F-ONLY-NEXT:    orl %ebp, %r14d
+; AVX512F-ONLY-NEXT:    movl %r14d, 8(%rsi)
+; AVX512F-ONLY-NEXT:    kmovw %k1, %eax
+; AVX512F-ONLY-NEXT:    movzbl %al, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    leaq (%rax,%rax,2), %r14
+; AVX512F-ONLY-NEXT:    leaq (%r14,%rax,4), %rax
+; AVX512F-ONLY-NEXT:    movzbl %r12b, %r14d
+; AVX512F-ONLY-NEXT:    andl $1, %r14d
+; AVX512F-ONLY-NEXT:    leaq (%rax,%r14,8), %rax
+; AVX512F-ONLY-NEXT:    movq %r14, %r12
+; AVX512F-ONLY-NEXT:    shlq $4, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $5, %r14
+; AVX512F-ONLY-NEXT:    orq %r12, %r14
+; AVX512F-ONLY-NEXT:    movzbl %r15b, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %r15
+; AVX512F-ONLY-NEXT:    shlq $6, %r15
+; AVX512F-ONLY-NEXT:    movq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $7, %r12
+; AVX512F-ONLY-NEXT:    orq %r15, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r15d
+; AVX512F-ONLY-NEXT:    kshiftrw $13, %k1, %k2
+; AVX512F-ONLY-NEXT:    shlq $8, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movzbl %bl, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rbx
+; AVX512F-ONLY-NEXT:    shlq $9, %rbx
+; AVX512F-ONLY-NEXT:    orq %rax, %rbx
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $10, %rax
+; AVX512F-ONLY-NEXT:    orq %rbx, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k2, %ebx
+; AVX512F-ONLY-NEXT:    kshiftrw $14, %k1, %k2
+; AVX512F-ONLY-NEXT:    shlq $11, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movzbl %r11b, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %r11
+; AVX512F-ONLY-NEXT:    shlq $12, %r11
+; AVX512F-ONLY-NEXT:    orq %r12, %r11
+; AVX512F-ONLY-NEXT:    movq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $13, %r12
+; AVX512F-ONLY-NEXT:    orq %r11, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r11d
+; AVX512F-ONLY-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512F-ONLY-NEXT:    shlq $14, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movzbl %r10b, %ebp
+; AVX512F-ONLY-NEXT:    andl $1, %ebp
+; AVX512F-ONLY-NEXT:    movq %rbp, %r10
+; AVX512F-ONLY-NEXT:    shlq $15, %r10
+; AVX512F-ONLY-NEXT:    orq %rax, %r10
+; AVX512F-ONLY-NEXT:    movq %rbp, %rax
+; AVX512F-ONLY-NEXT:    shlq $16, %rax
+; AVX512F-ONLY-NEXT:    orq %r10, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k1, %r10d
+; AVX512F-ONLY-NEXT:    kshiftrw $1, %k0, %k1
+; AVX512F-ONLY-NEXT:    shlq $17, %rbp
+; AVX512F-ONLY-NEXT:    orq %rax, %rbp
+; AVX512F-ONLY-NEXT:    movzbl %r9b, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $18, %rax
+; AVX512F-ONLY-NEXT:    orq %rbp, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %rbp
+; AVX512F-ONLY-NEXT:    shlq $19, %rbp
+; AVX512F-ONLY-NEXT:    orq %rax, %rbp
+; AVX512F-ONLY-NEXT:    kmovw %k1, %r9d
+; AVX512F-ONLY-NEXT:    kshiftrw $2, %k0, %k1
+; AVX512F-ONLY-NEXT:    shlq $20, %r12
+; AVX512F-ONLY-NEXT:    orq %rbp, %r12
+; AVX512F-ONLY-NEXT:    movzbl %r8b, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %r8
+; AVX512F-ONLY-NEXT:    shlq $21, %r8
+; AVX512F-ONLY-NEXT:    orq %r12, %r8
+; AVX512F-ONLY-NEXT:    movq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $22, %r12
+; AVX512F-ONLY-NEXT:    orq %r8, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k1, %r8d
+; AVX512F-ONLY-NEXT:    kshiftrw $3, %k0, %k1
+; AVX512F-ONLY-NEXT:    shlq $23, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movzbl %dil, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rdi
+; AVX512F-ONLY-NEXT:    shlq $24, %rdi
+; AVX512F-ONLY-NEXT:    orq %rax, %rdi
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $25, %rax
+; AVX512F-ONLY-NEXT:    orq %rdi, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k1, %edi
+; AVX512F-ONLY-NEXT:    shlq $26, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movzbl %dl, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %rdx
+; AVX512F-ONLY-NEXT:    shlq $27, %rdx
+; AVX512F-ONLY-NEXT:    orq %r12, %rdx
+; AVX512F-ONLY-NEXT:    movq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $28, %r12
+; AVX512F-ONLY-NEXT:    orq %rdx, %r12
+; AVX512F-ONLY-NEXT:    shlq $29, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movzbl %cl, %ebp
+; AVX512F-ONLY-NEXT:    andl $1, %ebp
+; AVX512F-ONLY-NEXT:    movq %rbp, %rdx
+; AVX512F-ONLY-NEXT:    shlq $30, %rdx
+; AVX512F-ONLY-NEXT:    orq %rax, %rdx
+; AVX512F-ONLY-NEXT:    kmovw %k0, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $4, %k0, %k0
+; AVX512F-ONLY-NEXT:    shlq $31, %rbp
+; AVX512F-ONLY-NEXT:    orq %rdx, %rbp
+; AVX512F-ONLY-NEXT:    kmovw %k0, %edx
+; AVX512F-ONLY-NEXT:    andl $1, %ecx
+; AVX512F-ONLY-NEXT:    shlq $32, %rcx
+; AVX512F-ONLY-NEXT:    orq %rbp, %rcx
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $33, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %rcx
+; AVX512F-ONLY-NEXT:    shlq $34, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    shlq $35, %r13
+; AVX512F-ONLY-NEXT:    orq %rcx, %r13
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $36, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %rcx
+; AVX512F-ONLY-NEXT:    shlq $37, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    shlq $38, %r15
+; AVX512F-ONLY-NEXT:    orq %rcx, %r15
+; AVX512F-ONLY-NEXT:    andl $1, %ebx
+; AVX512F-ONLY-NEXT:    movq %rbx, %rax
+; AVX512F-ONLY-NEXT:    shlq $39, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movq %rbx, %rcx
+; AVX512F-ONLY-NEXT:    shlq $40, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    shlq $41, %rbx
+; AVX512F-ONLY-NEXT:    orq %rcx, %rbx
+; AVX512F-ONLY-NEXT:    andl $1, %r11d
+; AVX512F-ONLY-NEXT:    movq %r11, %rax
+; AVX512F-ONLY-NEXT:    shlq $42, %rax
+; AVX512F-ONLY-NEXT:    orq %rbx, %rax
+; AVX512F-ONLY-NEXT:    movq %r11, %rcx
+; AVX512F-ONLY-NEXT:    shlq $43, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    shlq $44, %r11
+; AVX512F-ONLY-NEXT:    orq %rcx, %r11
+; AVX512F-ONLY-NEXT:    andl $1, %r10d
+; AVX512F-ONLY-NEXT:    movq %r10, %rax
+; AVX512F-ONLY-NEXT:    shlq $45, %rax
+; AVX512F-ONLY-NEXT:    orq %r11, %rax
+; AVX512F-ONLY-NEXT:    movq %r10, %rcx
+; AVX512F-ONLY-NEXT:    shlq $46, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    shlq $47, %r10
+; AVX512F-ONLY-NEXT:    orq %rcx, %r10
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $48, %rax
+; AVX512F-ONLY-NEXT:    orq %r10, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %rcx
+; AVX512F-ONLY-NEXT:    shlq $49, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    shlq $50, %r12
+; AVX512F-ONLY-NEXT:    orq %rcx, %r12
+; AVX512F-ONLY-NEXT:    andl $1, %r9d
+; AVX512F-ONLY-NEXT:    movq %r9, %rax
+; AVX512F-ONLY-NEXT:    shlq $51, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r9, %rcx
+; AVX512F-ONLY-NEXT:    shlq $52, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    shlq $53, %r9
+; AVX512F-ONLY-NEXT:    orq %rcx, %r9
+; AVX512F-ONLY-NEXT:    andl $1, %r8d
+; AVX512F-ONLY-NEXT:    movq %r8, %rax
+; AVX512F-ONLY-NEXT:    shlq $54, %rax
+; AVX512F-ONLY-NEXT:    orq %r9, %rax
+; AVX512F-ONLY-NEXT:    movq %r8, %rcx
+; AVX512F-ONLY-NEXT:    shlq $55, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    shlq $56, %r8
+; AVX512F-ONLY-NEXT:    orq %rcx, %r8
+; AVX512F-ONLY-NEXT:    andl $1, %edi
+; AVX512F-ONLY-NEXT:    movq %rdi, %rax
+; AVX512F-ONLY-NEXT:    shlq $57, %rax
+; AVX512F-ONLY-NEXT:    orq %r8, %rax
+; AVX512F-ONLY-NEXT:    movq %rdi, %rcx
+; AVX512F-ONLY-NEXT:    shlq $58, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    shlq $59, %rdi
+; AVX512F-ONLY-NEXT:    orq %rcx, %rdi
+; AVX512F-ONLY-NEXT:    andl $1, %edx
+; AVX512F-ONLY-NEXT:    movq %rdx, %rax
+; AVX512F-ONLY-NEXT:    shlq $60, %rax
+; AVX512F-ONLY-NEXT:    orq %rdi, %rax
+; AVX512F-ONLY-NEXT:    movq %rdx, %rcx
+; AVX512F-ONLY-NEXT:    shlq $61, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    shlq $62, %rdx
+; AVX512F-ONLY-NEXT:    orq %rcx, %rdx
+; AVX512F-ONLY-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; AVX512F-ONLY-NEXT:    shlq $63, %rax
+; AVX512F-ONLY-NEXT:    orq %rdx, %rax
+; AVX512F-ONLY-NEXT:    orq %r14, %rax
+; AVX512F-ONLY-NEXT:    movq %rax, (%rsi)
+; AVX512F-ONLY-NEXT:    popq %rbx
+; AVX512F-ONLY-NEXT:    popq %r12
+; AVX512F-ONLY-NEXT:    popq %r13
+; AVX512F-ONLY-NEXT:    popq %r14
+; AVX512F-ONLY-NEXT:    popq %r15
+; AVX512F-ONLY-NEXT:    popq %rbp
+; AVX512F-ONLY-NEXT:    retq
+;
+; AVX512DQ-LABEL: mask_replication_factor3_vf32:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    pushq %rbp
+; AVX512DQ-NEXT:    pushq %r15
+; AVX512DQ-NEXT:    pushq %r14
+; AVX512DQ-NEXT:    pushq %r13
+; AVX512DQ-NEXT:    pushq %r12
+; AVX512DQ-NEXT:    pushq %rbx
+; AVX512DQ-NEXT:    kmovw (%rdi), %k1
+; AVX512DQ-NEXT:    kmovw 2(%rdi), %k0
+; AVX512DQ-NEXT:    kshiftrw $5, %k0, %k2
+; AVX512DQ-NEXT:    kmovw %k2, %eax
+; AVX512DQ-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512DQ-NEXT:    kshiftrw $10, %k1, %k2
+; AVX512DQ-NEXT:    kmovw %k2, %ecx
+; AVX512DQ-NEXT:    kshiftrw $9, %k1, %k2
+; AVX512DQ-NEXT:    kmovw %k2, %edx
+; AVX512DQ-NEXT:    kshiftrw $8, %k1, %k2
+; AVX512DQ-NEXT:    kmovw %k2, %edi
+; AVX512DQ-NEXT:    kshiftrw $7, %k1, %k2
+; AVX512DQ-NEXT:    kmovw %k2, %r8d
+; AVX512DQ-NEXT:    kshiftrw $6, %k1, %k2
+; AVX512DQ-NEXT:    kmovw %k2, %r9d
+; AVX512DQ-NEXT:    kshiftrw $5, %k1, %k2
+; AVX512DQ-NEXT:    kmovw %k2, %r10d
+; AVX512DQ-NEXT:    kshiftrw $4, %k1, %k2
+; AVX512DQ-NEXT:    kmovw %k2, %r11d
+; AVX512DQ-NEXT:    kshiftrw $3, %k1, %k2
+; AVX512DQ-NEXT:    kmovw %k2, %ebx
+; AVX512DQ-NEXT:    kshiftrw $6, %k0, %k2
+; AVX512DQ-NEXT:    kmovw %k2, %r14d
+; AVX512DQ-NEXT:    kshiftrw $7, %k0, %k2
+; AVX512DQ-NEXT:    kmovw %k2, %r15d
+; AVX512DQ-NEXT:    kshiftrw $8, %k0, %k2
+; AVX512DQ-NEXT:    kmovw %k2, %r12d
+; AVX512DQ-NEXT:    kshiftrw $9, %k0, %k2
+; AVX512DQ-NEXT:    kmovw %k2, %r13d
+; AVX512DQ-NEXT:    kshiftrw $10, %k0, %k2
+; AVX512DQ-NEXT:    andl $1, %r14d
+; AVX512DQ-NEXT:    movl %eax, %ebp
+; AVX512DQ-NEXT:    andl $1, %ebp
+; AVX512DQ-NEXT:    leal (%rbp,%rbp,2), %ebp
+; AVX512DQ-NEXT:    leal (%rbp,%r14,4), %ebp
+; AVX512DQ-NEXT:    leal (%rbp,%r14,8), %ebp
+; AVX512DQ-NEXT:    shll $4, %r14d
+; AVX512DQ-NEXT:    orl %ebp, %r14d
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    movl %r15d, %ebp
+; AVX512DQ-NEXT:    shll $5, %ebp
+; AVX512DQ-NEXT:    orl %r14d, %ebp
+; AVX512DQ-NEXT:    movl %r15d, %r14d
+; AVX512DQ-NEXT:    shll $6, %r14d
+; AVX512DQ-NEXT:    shll $7, %r15d
+; AVX512DQ-NEXT:    orl %r14d, %r15d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movl %r12d, %r14d
+; AVX512DQ-NEXT:    shll $8, %r14d
+; AVX512DQ-NEXT:    orl %r15d, %r14d
+; AVX512DQ-NEXT:    movl %r12d, %r15d
+; AVX512DQ-NEXT:    shll $9, %r15d
+; AVX512DQ-NEXT:    orl %r14d, %r15d
+; AVX512DQ-NEXT:    kmovw %k2, %eax
+; AVX512DQ-NEXT:    kshiftrw $11, %k0, %k2
+; AVX512DQ-NEXT:    shll $10, %r12d
+; AVX512DQ-NEXT:    orl %r15d, %r12d
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movl %r13d, %r14d
+; AVX512DQ-NEXT:    shll $11, %r14d
+; AVX512DQ-NEXT:    orl %r12d, %r14d
+; AVX512DQ-NEXT:    movl %r13d, %r15d
+; AVX512DQ-NEXT:    shll $12, %r15d
+; AVX512DQ-NEXT:    orl %r14d, %r15d
+; AVX512DQ-NEXT:    kmovw %k2, %r12d
+; AVX512DQ-NEXT:    kshiftrw $12, %k0, %k2
+; AVX512DQ-NEXT:    shll $13, %r13d
+; AVX512DQ-NEXT:    orl %r15d, %r13d
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movl %eax, %r14d
+; AVX512DQ-NEXT:    shll $14, %r14d
+; AVX512DQ-NEXT:    orl %r13d, %r14d
+; AVX512DQ-NEXT:    movl %eax, %r15d
+; AVX512DQ-NEXT:    shll $15, %r15d
+; AVX512DQ-NEXT:    orl %r14d, %r15d
+; AVX512DQ-NEXT:    kmovw %k2, %r14d
+; AVX512DQ-NEXT:    kshiftrw $13, %k0, %k2
+; AVX512DQ-NEXT:    shll $16, %eax
+; AVX512DQ-NEXT:    orl %r15d, %eax
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movl %r12d, %r15d
+; AVX512DQ-NEXT:    shll $17, %r15d
+; AVX512DQ-NEXT:    orl %eax, %r15d
+; AVX512DQ-NEXT:    movl %r12d, %eax
+; AVX512DQ-NEXT:    shll $18, %eax
+; AVX512DQ-NEXT:    orl %r15d, %eax
+; AVX512DQ-NEXT:    kmovw %k2, %r15d
+; AVX512DQ-NEXT:    kshiftrw $14, %k0, %k2
+; AVX512DQ-NEXT:    shll $19, %r12d
+; AVX512DQ-NEXT:    orl %eax, %r12d
+; AVX512DQ-NEXT:    andl $1, %r14d
+; AVX512DQ-NEXT:    movl %r14d, %eax
+; AVX512DQ-NEXT:    shll $20, %eax
+; AVX512DQ-NEXT:    orl %r12d, %eax
+; AVX512DQ-NEXT:    movl %r14d, %r12d
+; AVX512DQ-NEXT:    shll $21, %r12d
+; AVX512DQ-NEXT:    orl %eax, %r12d
+; AVX512DQ-NEXT:    kmovw %k2, %eax
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k2
+; AVX512DQ-NEXT:    shll $22, %r14d
+; AVX512DQ-NEXT:    orl %r12d, %r14d
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    movl %r15d, %r12d
+; AVX512DQ-NEXT:    shll $23, %r12d
+; AVX512DQ-NEXT:    orl %r14d, %r12d
+; AVX512DQ-NEXT:    movl %r15d, %r13d
+; AVX512DQ-NEXT:    shll $24, %r13d
+; AVX512DQ-NEXT:    orl %r12d, %r13d
+; AVX512DQ-NEXT:    kmovw %k2, %r14d
+; AVX512DQ-NEXT:    kshiftrw $2, %k1, %k2
+; AVX512DQ-NEXT:    shll $25, %r15d
+; AVX512DQ-NEXT:    orl %r13d, %r15d
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movl %eax, %r12d
+; AVX512DQ-NEXT:    shll $26, %r12d
+; AVX512DQ-NEXT:    orl %r15d, %r12d
+; AVX512DQ-NEXT:    movl %eax, %r15d
+; AVX512DQ-NEXT:    shll $27, %r15d
+; AVX512DQ-NEXT:    orl %r12d, %r15d
+; AVX512DQ-NEXT:    shll $28, %eax
+; AVX512DQ-NEXT:    orl %r15d, %eax
+; AVX512DQ-NEXT:    movl %r14d, %r13d
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movl %r13d, %r12d
+; AVX512DQ-NEXT:    shll $29, %r12d
+; AVX512DQ-NEXT:    orl %eax, %r12d
+; AVX512DQ-NEXT:    kmovw %k2, %r15d
+; AVX512DQ-NEXT:    kshiftrw $1, %k1, %k2
+; AVX512DQ-NEXT:    shll $30, %r13d
+; AVX512DQ-NEXT:    orl %r12d, %r13d
+; AVX512DQ-NEXT:    kmovw %k2, %r12d
+; AVX512DQ-NEXT:    kshiftrw $11, %k1, %k2
+; AVX512DQ-NEXT:    shll $31, %r14d
+; AVX512DQ-NEXT:    orl %r13d, %r14d
+; AVX512DQ-NEXT:    kmovw %k2, %r13d
+; AVX512DQ-NEXT:    kshiftrw $12, %k1, %k2
+; AVX512DQ-NEXT:    orl %ebp, %r14d
+; AVX512DQ-NEXT:    movl %r14d, 8(%rsi)
+; AVX512DQ-NEXT:    kmovw %k1, %eax
+; AVX512DQ-NEXT:    movzbl %al, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    leaq (%rax,%rax,2), %r14
+; AVX512DQ-NEXT:    leaq (%r14,%rax,4), %rax
+; AVX512DQ-NEXT:    movzbl %r12b, %r14d
+; AVX512DQ-NEXT:    andl $1, %r14d
+; AVX512DQ-NEXT:    leaq (%rax,%r14,8), %rax
+; AVX512DQ-NEXT:    movq %r14, %r12
+; AVX512DQ-NEXT:    shlq $4, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    shlq $5, %r14
+; AVX512DQ-NEXT:    orq %r12, %r14
+; AVX512DQ-NEXT:    movzbl %r15b, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %r15
+; AVX512DQ-NEXT:    shlq $6, %r15
+; AVX512DQ-NEXT:    movq %rax, %r12
+; AVX512DQ-NEXT:    shlq $7, %r12
+; AVX512DQ-NEXT:    orq %r15, %r12
+; AVX512DQ-NEXT:    kmovw %k2, %r15d
+; AVX512DQ-NEXT:    kshiftrw $13, %k1, %k2
+; AVX512DQ-NEXT:    shlq $8, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movzbl %bl, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rbx
+; AVX512DQ-NEXT:    shlq $9, %rbx
+; AVX512DQ-NEXT:    orq %rax, %rbx
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $10, %rax
+; AVX512DQ-NEXT:    orq %rbx, %rax
+; AVX512DQ-NEXT:    kmovw %k2, %ebx
+; AVX512DQ-NEXT:    kshiftrw $14, %k1, %k2
+; AVX512DQ-NEXT:    shlq $11, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movzbl %r11b, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %r11
+; AVX512DQ-NEXT:    shlq $12, %r11
+; AVX512DQ-NEXT:    orq %r12, %r11
+; AVX512DQ-NEXT:    movq %rax, %r12
+; AVX512DQ-NEXT:    shlq $13, %r12
+; AVX512DQ-NEXT:    orq %r11, %r12
+; AVX512DQ-NEXT:    kmovw %k2, %r11d
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    shlq $14, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movzbl %r10b, %ebp
+; AVX512DQ-NEXT:    andl $1, %ebp
+; AVX512DQ-NEXT:    movq %rbp, %r10
+; AVX512DQ-NEXT:    shlq $15, %r10
+; AVX512DQ-NEXT:    orq %rax, %r10
+; AVX512DQ-NEXT:    movq %rbp, %rax
+; AVX512DQ-NEXT:    shlq $16, %rax
+; AVX512DQ-NEXT:    orq %r10, %rax
+; AVX512DQ-NEXT:    kmovw %k1, %r10d
+; AVX512DQ-NEXT:    kshiftrw $1, %k0, %k1
+; AVX512DQ-NEXT:    shlq $17, %rbp
+; AVX512DQ-NEXT:    orq %rax, %rbp
+; AVX512DQ-NEXT:    movzbl %r9b, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $18, %rax
+; AVX512DQ-NEXT:    orq %rbp, %rax
+; AVX512DQ-NEXT:    movq %r12, %rbp
+; AVX512DQ-NEXT:    shlq $19, %rbp
+; AVX512DQ-NEXT:    orq %rax, %rbp
+; AVX512DQ-NEXT:    kmovw %k1, %r9d
+; AVX512DQ-NEXT:    kshiftrw $2, %k0, %k1
+; AVX512DQ-NEXT:    shlq $20, %r12
+; AVX512DQ-NEXT:    orq %rbp, %r12
+; AVX512DQ-NEXT:    movzbl %r8b, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %r8
+; AVX512DQ-NEXT:    shlq $21, %r8
+; AVX512DQ-NEXT:    orq %r12, %r8
+; AVX512DQ-NEXT:    movq %rax, %r12
+; AVX512DQ-NEXT:    shlq $22, %r12
+; AVX512DQ-NEXT:    orq %r8, %r12
+; AVX512DQ-NEXT:    kmovw %k1, %r8d
+; AVX512DQ-NEXT:    kshiftrw $3, %k0, %k1
+; AVX512DQ-NEXT:    shlq $23, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movzbl %dil, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rdi
+; AVX512DQ-NEXT:    shlq $24, %rdi
+; AVX512DQ-NEXT:    orq %rax, %rdi
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $25, %rax
+; AVX512DQ-NEXT:    orq %rdi, %rax
+; AVX512DQ-NEXT:    kmovw %k1, %edi
+; AVX512DQ-NEXT:    shlq $26, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movzbl %dl, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %rdx
+; AVX512DQ-NEXT:    shlq $27, %rdx
+; AVX512DQ-NEXT:    orq %r12, %rdx
+; AVX512DQ-NEXT:    movq %rax, %r12
+; AVX512DQ-NEXT:    shlq $28, %r12
+; AVX512DQ-NEXT:    orq %rdx, %r12
+; AVX512DQ-NEXT:    shlq $29, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movzbl %cl, %ebp
+; AVX512DQ-NEXT:    andl $1, %ebp
+; AVX512DQ-NEXT:    movq %rbp, %rdx
+; AVX512DQ-NEXT:    shlq $30, %rdx
+; AVX512DQ-NEXT:    orq %rax, %rdx
+; AVX512DQ-NEXT:    kmovw %k0, %r12d
+; AVX512DQ-NEXT:    kshiftrw $4, %k0, %k0
+; AVX512DQ-NEXT:    shlq $31, %rbp
+; AVX512DQ-NEXT:    orq %rdx, %rbp
+; AVX512DQ-NEXT:    kmovw %k0, %edx
+; AVX512DQ-NEXT:    andl $1, %ecx
+; AVX512DQ-NEXT:    shlq $32, %rcx
+; AVX512DQ-NEXT:    orq %rbp, %rcx
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $33, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    movq %r13, %rcx
+; AVX512DQ-NEXT:    shlq $34, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    shlq $35, %r13
+; AVX512DQ-NEXT:    orq %rcx, %r13
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $36, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r15, %rcx
+; AVX512DQ-NEXT:    shlq $37, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    shlq $38, %r15
+; AVX512DQ-NEXT:    orq %rcx, %r15
+; AVX512DQ-NEXT:    andl $1, %ebx
+; AVX512DQ-NEXT:    movq %rbx, %rax
+; AVX512DQ-NEXT:    shlq $39, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movq %rbx, %rcx
+; AVX512DQ-NEXT:    shlq $40, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    shlq $41, %rbx
+; AVX512DQ-NEXT:    orq %rcx, %rbx
+; AVX512DQ-NEXT:    andl $1, %r11d
+; AVX512DQ-NEXT:    movq %r11, %rax
+; AVX512DQ-NEXT:    shlq $42, %rax
+; AVX512DQ-NEXT:    orq %rbx, %rax
+; AVX512DQ-NEXT:    movq %r11, %rcx
+; AVX512DQ-NEXT:    shlq $43, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    shlq $44, %r11
+; AVX512DQ-NEXT:    orq %rcx, %r11
+; AVX512DQ-NEXT:    andl $1, %r10d
+; AVX512DQ-NEXT:    movq %r10, %rax
+; AVX512DQ-NEXT:    shlq $45, %rax
+; AVX512DQ-NEXT:    orq %r11, %rax
+; AVX512DQ-NEXT:    movq %r10, %rcx
+; AVX512DQ-NEXT:    shlq $46, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    shlq $47, %r10
+; AVX512DQ-NEXT:    orq %rcx, %r10
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $48, %rax
+; AVX512DQ-NEXT:    orq %r10, %rax
+; AVX512DQ-NEXT:    movq %r12, %rcx
+; AVX512DQ-NEXT:    shlq $49, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    shlq $50, %r12
+; AVX512DQ-NEXT:    orq %rcx, %r12
+; AVX512DQ-NEXT:    andl $1, %r9d
+; AVX512DQ-NEXT:    movq %r9, %rax
+; AVX512DQ-NEXT:    shlq $51, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r9, %rcx
+; AVX512DQ-NEXT:    shlq $52, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    shlq $53, %r9
+; AVX512DQ-NEXT:    orq %rcx, %r9
+; AVX512DQ-NEXT:    andl $1, %r8d
+; AVX512DQ-NEXT:    movq %r8, %rax
+; AVX512DQ-NEXT:    shlq $54, %rax
+; AVX512DQ-NEXT:    orq %r9, %rax
+; AVX512DQ-NEXT:    movq %r8, %rcx
+; AVX512DQ-NEXT:    shlq $55, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    shlq $56, %r8
+; AVX512DQ-NEXT:    orq %rcx, %r8
+; AVX512DQ-NEXT:    andl $1, %edi
+; AVX512DQ-NEXT:    movq %rdi, %rax
+; AVX512DQ-NEXT:    shlq $57, %rax
+; AVX512DQ-NEXT:    orq %r8, %rax
+; AVX512DQ-NEXT:    movq %rdi, %rcx
+; AVX512DQ-NEXT:    shlq $58, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    shlq $59, %rdi
+; AVX512DQ-NEXT:    orq %rcx, %rdi
+; AVX512DQ-NEXT:    andl $1, %edx
+; AVX512DQ-NEXT:    movq %rdx, %rax
+; AVX512DQ-NEXT:    shlq $60, %rax
+; AVX512DQ-NEXT:    orq %rdi, %rax
+; AVX512DQ-NEXT:    movq %rdx, %rcx
+; AVX512DQ-NEXT:    shlq $61, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    shlq $62, %rdx
+; AVX512DQ-NEXT:    orq %rcx, %rdx
+; AVX512DQ-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; AVX512DQ-NEXT:    shlq $63, %rax
+; AVX512DQ-NEXT:    orq %rdx, %rax
+; AVX512DQ-NEXT:    orq %r14, %rax
+; AVX512DQ-NEXT:    movq %rax, (%rsi)
+; AVX512DQ-NEXT:    popq %rbx
+; AVX512DQ-NEXT:    popq %r12
+; AVX512DQ-NEXT:    popq %r13
+; AVX512DQ-NEXT:    popq %r14
+; AVX512DQ-NEXT:    popq %r15
+; AVX512DQ-NEXT:    popq %rbp
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: mask_replication_factor3_vf32:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    pushq %rbp
+; AVX512BW-NEXT:    pushq %r15
+; AVX512BW-NEXT:    pushq %r14
+; AVX512BW-NEXT:    pushq %r13
+; AVX512BW-NEXT:    pushq %r12
+; AVX512BW-NEXT:    pushq %rbx
+; AVX512BW-NEXT:    movq %rsi, %rdx
+; AVX512BW-NEXT:    kmovd (%rdi), %k0
+; AVX512BW-NEXT:    kshiftrd $21, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %esi
+; AVX512BW-NEXT:    kshiftrd $10, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %ecx
+; AVX512BW-NEXT:    kshiftrd $9, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512BW-NEXT:    kshiftrd $8, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %edi
+; AVX512BW-NEXT:    kshiftrd $7, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r8d
+; AVX512BW-NEXT:    kshiftrd $6, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r9d
+; AVX512BW-NEXT:    kshiftrd $5, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r10d
+; AVX512BW-NEXT:    kshiftrd $4, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r11d
+; AVX512BW-NEXT:    kshiftrd $3, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %ebx
+; AVX512BW-NEXT:    kshiftrd $22, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r14d
+; AVX512BW-NEXT:    kshiftrd $23, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r15d
+; AVX512BW-NEXT:    kshiftrd $24, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrd $25, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r13d
+; AVX512BW-NEXT:    kshiftrd $26, %k0, %k1
+; AVX512BW-NEXT:    andl $1, %r14d
+; AVX512BW-NEXT:    movl %esi, %ebp
+; AVX512BW-NEXT:    andl $1, %ebp
+; AVX512BW-NEXT:    leal (%rbp,%rbp,2), %ebp
+; AVX512BW-NEXT:    leal (%rbp,%r14,4), %ebp
+; AVX512BW-NEXT:    leal (%rbp,%r14,8), %ebp
+; AVX512BW-NEXT:    shll $4, %r14d
+; AVX512BW-NEXT:    orl %ebp, %r14d
+; AVX512BW-NEXT:    andl $1, %r15d
+; AVX512BW-NEXT:    movl %r15d, %ebp
+; AVX512BW-NEXT:    shll $5, %ebp
+; AVX512BW-NEXT:    orl %r14d, %ebp
+; AVX512BW-NEXT:    movl %r15d, %r14d
+; AVX512BW-NEXT:    shll $6, %r14d
+; AVX512BW-NEXT:    shll $7, %r15d
+; AVX512BW-NEXT:    orl %r14d, %r15d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movl %r12d, %r14d
+; AVX512BW-NEXT:    shll $8, %r14d
+; AVX512BW-NEXT:    orl %r15d, %r14d
+; AVX512BW-NEXT:    movl %r12d, %r15d
+; AVX512BW-NEXT:    shll $9, %r15d
+; AVX512BW-NEXT:    orl %r14d, %r15d
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    kshiftrd $27, %k0, %k1
+; AVX512BW-NEXT:    shll $10, %r12d
+; AVX512BW-NEXT:    orl %r15d, %r12d
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movl %r13d, %r14d
+; AVX512BW-NEXT:    shll $11, %r14d
+; AVX512BW-NEXT:    orl %r12d, %r14d
+; AVX512BW-NEXT:    movl %r13d, %r15d
+; AVX512BW-NEXT:    shll $12, %r15d
+; AVX512BW-NEXT:    orl %r14d, %r15d
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrd $28, %k0, %k1
+; AVX512BW-NEXT:    shll $13, %r13d
+; AVX512BW-NEXT:    orl %r15d, %r13d
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movl %eax, %r14d
+; AVX512BW-NEXT:    shll $14, %r14d
+; AVX512BW-NEXT:    orl %r13d, %r14d
+; AVX512BW-NEXT:    movl %eax, %r15d
+; AVX512BW-NEXT:    shll $15, %r15d
+; AVX512BW-NEXT:    orl %r14d, %r15d
+; AVX512BW-NEXT:    kmovd %k1, %r14d
+; AVX512BW-NEXT:    kshiftrd $29, %k0, %k1
+; AVX512BW-NEXT:    shll $16, %eax
+; AVX512BW-NEXT:    orl %r15d, %eax
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movl %r12d, %r15d
+; AVX512BW-NEXT:    shll $17, %r15d
+; AVX512BW-NEXT:    orl %eax, %r15d
+; AVX512BW-NEXT:    movl %r12d, %eax
+; AVX512BW-NEXT:    shll $18, %eax
+; AVX512BW-NEXT:    orl %r15d, %eax
+; AVX512BW-NEXT:    kmovd %k1, %r15d
+; AVX512BW-NEXT:    kshiftrd $30, %k0, %k1
+; AVX512BW-NEXT:    shll $19, %r12d
+; AVX512BW-NEXT:    orl %eax, %r12d
+; AVX512BW-NEXT:    andl $1, %r14d
+; AVX512BW-NEXT:    movl %r14d, %eax
+; AVX512BW-NEXT:    shll $20, %eax
+; AVX512BW-NEXT:    orl %r12d, %eax
+; AVX512BW-NEXT:    movl %r14d, %r12d
+; AVX512BW-NEXT:    shll $21, %r12d
+; AVX512BW-NEXT:    orl %eax, %r12d
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    kshiftrd $31, %k0, %k1
+; AVX512BW-NEXT:    shll $22, %r14d
+; AVX512BW-NEXT:    orl %r12d, %r14d
+; AVX512BW-NEXT:    andl $1, %r15d
+; AVX512BW-NEXT:    movl %r15d, %r12d
+; AVX512BW-NEXT:    shll $23, %r12d
+; AVX512BW-NEXT:    orl %r14d, %r12d
+; AVX512BW-NEXT:    movl %r15d, %r13d
+; AVX512BW-NEXT:    shll $24, %r13d
+; AVX512BW-NEXT:    orl %r12d, %r13d
+; AVX512BW-NEXT:    kmovd %k1, %r14d
+; AVX512BW-NEXT:    kshiftrd $2, %k0, %k1
+; AVX512BW-NEXT:    shll $25, %r15d
+; AVX512BW-NEXT:    orl %r13d, %r15d
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movl %eax, %r12d
+; AVX512BW-NEXT:    shll $26, %r12d
+; AVX512BW-NEXT:    orl %r15d, %r12d
+; AVX512BW-NEXT:    movl %eax, %r15d
+; AVX512BW-NEXT:    shll $27, %r15d
+; AVX512BW-NEXT:    orl %r12d, %r15d
+; AVX512BW-NEXT:    shll $28, %eax
+; AVX512BW-NEXT:    orl %r15d, %eax
+; AVX512BW-NEXT:    movl %r14d, %r13d
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movl %r13d, %r12d
+; AVX512BW-NEXT:    shll $29, %r12d
+; AVX512BW-NEXT:    orl %eax, %r12d
+; AVX512BW-NEXT:    kmovd %k1, %r15d
+; AVX512BW-NEXT:    kshiftrd $1, %k0, %k1
+; AVX512BW-NEXT:    shll $30, %r13d
+; AVX512BW-NEXT:    orl %r12d, %r13d
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrd $11, %k0, %k1
+; AVX512BW-NEXT:    shll $31, %r14d
+; AVX512BW-NEXT:    orl %r13d, %r14d
+; AVX512BW-NEXT:    kmovd %k1, %r13d
+; AVX512BW-NEXT:    kshiftrd $12, %k0, %k1
+; AVX512BW-NEXT:    orl %ebp, %r14d
+; AVX512BW-NEXT:    movq %rdx, %rbp
+; AVX512BW-NEXT:    movl %r14d, 8(%rdx)
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    movzbl %al, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    leaq (%rax,%rax,2), %r14
+; AVX512BW-NEXT:    leaq (%r14,%rax,4), %rax
+; AVX512BW-NEXT:    movzbl %r12b, %r14d
+; AVX512BW-NEXT:    andl $1, %r14d
+; AVX512BW-NEXT:    leaq (%rax,%r14,8), %rax
+; AVX512BW-NEXT:    movq %r14, %r12
+; AVX512BW-NEXT:    shlq $4, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    shlq $5, %r14
+; AVX512BW-NEXT:    orq %r12, %r14
+; AVX512BW-NEXT:    movzbl %r15b, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %r15
+; AVX512BW-NEXT:    shlq $6, %r15
+; AVX512BW-NEXT:    movq %rax, %r12
+; AVX512BW-NEXT:    shlq $7, %r12
+; AVX512BW-NEXT:    orq %r15, %r12
+; AVX512BW-NEXT:    kmovd %k1, %r15d
+; AVX512BW-NEXT:    kshiftrd $13, %k0, %k1
+; AVX512BW-NEXT:    shlq $8, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movzbl %bl, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rbx
+; AVX512BW-NEXT:    shlq $9, %rbx
+; AVX512BW-NEXT:    orq %rax, %rbx
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $10, %rax
+; AVX512BW-NEXT:    orq %rbx, %rax
+; AVX512BW-NEXT:    kmovd %k1, %ebx
+; AVX512BW-NEXT:    kshiftrd $14, %k0, %k1
+; AVX512BW-NEXT:    shlq $11, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movzbl %r11b, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %r11
+; AVX512BW-NEXT:    shlq $12, %r11
+; AVX512BW-NEXT:    orq %r12, %r11
+; AVX512BW-NEXT:    movq %rax, %r12
+; AVX512BW-NEXT:    shlq $13, %r12
+; AVX512BW-NEXT:    orq %r11, %r12
+; AVX512BW-NEXT:    kmovd %k1, %r11d
+; AVX512BW-NEXT:    kshiftrd $15, %k0, %k1
+; AVX512BW-NEXT:    shlq $14, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movzbl %r10b, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %r10
+; AVX512BW-NEXT:    shlq $15, %r10
+; AVX512BW-NEXT:    orq %rax, %r10
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $16, %rax
+; AVX512BW-NEXT:    orq %r10, %rax
+; AVX512BW-NEXT:    kmovd %k1, %r10d
+; AVX512BW-NEXT:    kshiftrd $16, %k0, %k1
+; AVX512BW-NEXT:    shlq $17, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movzbl %r9b, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %r9
+; AVX512BW-NEXT:    shlq $18, %r9
+; AVX512BW-NEXT:    orq %r12, %r9
+; AVX512BW-NEXT:    movq %rax, %r12
+; AVX512BW-NEXT:    shlq $19, %r12
+; AVX512BW-NEXT:    orq %r9, %r12
+; AVX512BW-NEXT:    kmovd %k1, %r9d
+; AVX512BW-NEXT:    kshiftrd $17, %k0, %k1
+; AVX512BW-NEXT:    shlq $20, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movzbl %r8b, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %r8
+; AVX512BW-NEXT:    shlq $21, %r8
+; AVX512BW-NEXT:    orq %rax, %r8
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $22, %rax
+; AVX512BW-NEXT:    orq %r8, %rax
+; AVX512BW-NEXT:    kmovd %k1, %r8d
+; AVX512BW-NEXT:    kshiftrd $18, %k0, %k1
+; AVX512BW-NEXT:    shlq $23, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movzbl %dil, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %rdi
+; AVX512BW-NEXT:    shlq $24, %rdi
+; AVX512BW-NEXT:    orq %r12, %rdi
+; AVX512BW-NEXT:    movq %rax, %r12
+; AVX512BW-NEXT:    shlq $25, %r12
+; AVX512BW-NEXT:    orq %rdi, %r12
+; AVX512BW-NEXT:    kmovd %k1, %edi
+; AVX512BW-NEXT:    kshiftrd $19, %k0, %k1
+; AVX512BW-NEXT:    shlq $26, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
+; AVX512BW-NEXT:    andl $1, %edx
+; AVX512BW-NEXT:    movq %rdx, %r12
+; AVX512BW-NEXT:    shlq $27, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %rdx, %rax
+; AVX512BW-NEXT:    shlq $28, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    shlq $29, %rdx
+; AVX512BW-NEXT:    orq %rax, %rdx
+; AVX512BW-NEXT:    movzbl %cl, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %r12
+; AVX512BW-NEXT:    shlq $30, %r12
+; AVX512BW-NEXT:    orq %rdx, %r12
+; AVX512BW-NEXT:    kmovd %k1, %edx
+; AVX512BW-NEXT:    kshiftrd $20, %k0, %k0
+; AVX512BW-NEXT:    shlq $31, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    kmovd %k0, %r12d
+; AVX512BW-NEXT:    andl $1, %ecx
+; AVX512BW-NEXT:    shlq $32, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $33, %rax
+; AVX512BW-NEXT:    orq %rcx, %rax
+; AVX512BW-NEXT:    movq %r13, %rcx
+; AVX512BW-NEXT:    shlq $34, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    shlq $35, %r13
+; AVX512BW-NEXT:    orq %rcx, %r13
+; AVX512BW-NEXT:    andl $1, %r15d
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $36, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r15, %rcx
+; AVX512BW-NEXT:    shlq $37, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    shlq $38, %r15
+; AVX512BW-NEXT:    orq %rcx, %r15
+; AVX512BW-NEXT:    andl $1, %ebx
+; AVX512BW-NEXT:    movq %rbx, %rax
+; AVX512BW-NEXT:    shlq $39, %rax
+; AVX512BW-NEXT:    orq %r15, %rax
+; AVX512BW-NEXT:    movq %rbx, %rcx
+; AVX512BW-NEXT:    shlq $40, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    shlq $41, %rbx
+; AVX512BW-NEXT:    orq %rcx, %rbx
+; AVX512BW-NEXT:    andl $1, %r11d
+; AVX512BW-NEXT:    movq %r11, %rax
+; AVX512BW-NEXT:    shlq $42, %rax
+; AVX512BW-NEXT:    orq %rbx, %rax
+; AVX512BW-NEXT:    movq %r11, %rcx
+; AVX512BW-NEXT:    shlq $43, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    shlq $44, %r11
+; AVX512BW-NEXT:    orq %rcx, %r11
+; AVX512BW-NEXT:    andl $1, %r10d
+; AVX512BW-NEXT:    movq %r10, %rax
+; AVX512BW-NEXT:    shlq $45, %rax
+; AVX512BW-NEXT:    orq %r11, %rax
+; AVX512BW-NEXT:    movq %r10, %rcx
+; AVX512BW-NEXT:    shlq $46, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    shlq $47, %r10
+; AVX512BW-NEXT:    orq %rcx, %r10
+; AVX512BW-NEXT:    andl $1, %r9d
+; AVX512BW-NEXT:    movq %r9, %rax
+; AVX512BW-NEXT:    shlq $48, %rax
+; AVX512BW-NEXT:    orq %r10, %rax
+; AVX512BW-NEXT:    movq %r9, %rcx
+; AVX512BW-NEXT:    shlq $49, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    shlq $50, %r9
+; AVX512BW-NEXT:    orq %rcx, %r9
+; AVX512BW-NEXT:    andl $1, %r8d
+; AVX512BW-NEXT:    movq %r8, %rax
+; AVX512BW-NEXT:    shlq $51, %rax
+; AVX512BW-NEXT:    orq %r9, %rax
+; AVX512BW-NEXT:    movq %r8, %rcx
+; AVX512BW-NEXT:    shlq $52, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    shlq $53, %r8
+; AVX512BW-NEXT:    orq %rcx, %r8
+; AVX512BW-NEXT:    andl $1, %edi
+; AVX512BW-NEXT:    movq %rdi, %rax
+; AVX512BW-NEXT:    shlq $54, %rax
+; AVX512BW-NEXT:    orq %r8, %rax
+; AVX512BW-NEXT:    movq %rdi, %rcx
+; AVX512BW-NEXT:    shlq $55, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    shlq $56, %rdi
+; AVX512BW-NEXT:    orq %rcx, %rdi
+; AVX512BW-NEXT:    andl $1, %edx
+; AVX512BW-NEXT:    movq %rdx, %rax
+; AVX512BW-NEXT:    shlq $57, %rax
+; AVX512BW-NEXT:    orq %rdi, %rax
+; AVX512BW-NEXT:    movq %rdx, %rcx
+; AVX512BW-NEXT:    shlq $58, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    shlq $59, %rdx
+; AVX512BW-NEXT:    orq %rcx, %rdx
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $60, %rax
+; AVX512BW-NEXT:    orq %rdx, %rax
+; AVX512BW-NEXT:    movq %r12, %rcx
+; AVX512BW-NEXT:    shlq $61, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    shlq $62, %r12
+; AVX512BW-NEXT:    orq %rcx, %r12
+; AVX512BW-NEXT:    movzbl %sil, %eax
+; AVX512BW-NEXT:    shlq $63, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    orq %r14, %rax
+; AVX512BW-NEXT:    movq %rax, (%rbp)
+; AVX512BW-NEXT:    popq %rbx
+; AVX512BW-NEXT:    popq %r12
+; AVX512BW-NEXT:    popq %r13
+; AVX512BW-NEXT:    popq %r14
+; AVX512BW-NEXT:    popq %r15
+; AVX512BW-NEXT:    popq %rbp
+; AVX512BW-NEXT:    retq
+  %src.vec = load <32 x i1>, ptr %in.vec, align 64
+  %tgt.mask = shufflevector <32 x i1> %src.vec, <32 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
+  store <96 x i1> %tgt.mask, ptr %out.vec, align 64
+  ret void
+}
+
+define void @mask_replication_factor3_vf64(ptr %in.vec, ptr %out.vec) nounwind {
+; AVX512F-ONLY-LABEL: mask_replication_factor3_vf64:
+; AVX512F-ONLY:       # %bb.0:
+; AVX512F-ONLY-NEXT:    pushq %rbp
+; AVX512F-ONLY-NEXT:    pushq %r15
+; AVX512F-ONLY-NEXT:    pushq %r14
+; AVX512F-ONLY-NEXT:    pushq %r13
+; AVX512F-ONLY-NEXT:    pushq %r12
+; AVX512F-ONLY-NEXT:    pushq %rbx
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k3
+; AVX512F-ONLY-NEXT:    kmovw 2(%rdi), %k1
+; AVX512F-ONLY-NEXT:    kmovw 4(%rdi), %k0
+; AVX512F-ONLY-NEXT:    kmovw 6(%rdi), %k2
+; AVX512F-ONLY-NEXT:    kshiftrw $7, %k0, %k4
+; AVX512F-ONLY-NEXT:    kmovw %k4, %eax
+; AVX512F-ONLY-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512F-ONLY-NEXT:    kshiftrw $15, %k1, %k4
+; AVX512F-ONLY-NEXT:    kmovw %k4, %ecx
+; AVX512F-ONLY-NEXT:    kshiftrw $14, %k1, %k4
+; AVX512F-ONLY-NEXT:    kmovw %k4, %edx
+; AVX512F-ONLY-NEXT:    kshiftrw $13, %k1, %k4
+; AVX512F-ONLY-NEXT:    kmovw %k4, %edi
+; AVX512F-ONLY-NEXT:    kshiftrw $12, %k1, %k4
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r8d
+; AVX512F-ONLY-NEXT:    kshiftrw $11, %k1, %k4
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r9d
+; AVX512F-ONLY-NEXT:    kshiftrw $15, %k2, %k4
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r11d
+; AVX512F-ONLY-NEXT:    kshiftrw $13, %k0, %k4
+; AVX512F-ONLY-NEXT:    kmovw %k4, %ebp
+; AVX512F-ONLY-NEXT:    kshiftrw $12, %k0, %k4
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r14d
+; AVX512F-ONLY-NEXT:    kshiftrw $10, %k3, %k4
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r15d
+; AVX512F-ONLY-NEXT:    kshiftrw $2, %k3, %k4
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r10d
+; AVX512F-ONLY-NEXT:    kshiftrw $1, %k3, %k4
+; AVX512F-ONLY-NEXT:    kmovw %k4, %ebx
+; AVX512F-ONLY-NEXT:    kshiftrw $3, %k3, %k4
+; AVX512F-ONLY-NEXT:    kmovw %k3, %r12d
+; AVX512F-ONLY-NEXT:    movzbl %r12b, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    leaq (%r12,%r12,2), %r13
+; AVX512F-ONLY-NEXT:    leaq (%r13,%r12,4), %r12
+; AVX512F-ONLY-NEXT:    movzbl %bl, %ebx
+; AVX512F-ONLY-NEXT:    andl $1, %ebx
+; AVX512F-ONLY-NEXT:    leaq (%r12,%rbx,8), %r12
+; AVX512F-ONLY-NEXT:    movq %rbx, %r13
+; AVX512F-ONLY-NEXT:    shlq $4, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $5, %rbx
+; AVX512F-ONLY-NEXT:    orq %r13, %rbx
+; AVX512F-ONLY-NEXT:    movzbl %r10b, %r10d
+; AVX512F-ONLY-NEXT:    andl $1, %r10d
+; AVX512F-ONLY-NEXT:    movq %r10, %r12
+; AVX512F-ONLY-NEXT:    shlq $6, %r12
+; AVX512F-ONLY-NEXT:    movq %r10, %r13
+; AVX512F-ONLY-NEXT:    shlq $7, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $4, %k3, %k4
+; AVX512F-ONLY-NEXT:    shlq $8, %r10
+; AVX512F-ONLY-NEXT:    orq %r13, %r10
+; AVX512F-ONLY-NEXT:    movzbl %r12b, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $9, %r13
+; AVX512F-ONLY-NEXT:    orq %r10, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %r10
+; AVX512F-ONLY-NEXT:    shlq $10, %r10
+; AVX512F-ONLY-NEXT:    orq %r13, %r10
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r13d
+; AVX512F-ONLY-NEXT:    kshiftrw $5, %k3, %k4
+; AVX512F-ONLY-NEXT:    shlq $11, %r12
+; AVX512F-ONLY-NEXT:    orq %r10, %r12
+; AVX512F-ONLY-NEXT:    movzbl %r13b, %r10d
+; AVX512F-ONLY-NEXT:    andl $1, %r10d
+; AVX512F-ONLY-NEXT:    movq %r10, %r13
+; AVX512F-ONLY-NEXT:    shlq $12, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    movq %r10, %r12
+; AVX512F-ONLY-NEXT:    shlq $13, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r13d
+; AVX512F-ONLY-NEXT:    kshiftrw $6, %k3, %k4
+; AVX512F-ONLY-NEXT:    shlq $14, %r10
+; AVX512F-ONLY-NEXT:    orq %r12, %r10
+; AVX512F-ONLY-NEXT:    movzbl %r13b, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $15, %r13
+; AVX512F-ONLY-NEXT:    orq %r10, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %r10
+; AVX512F-ONLY-NEXT:    shlq $16, %r10
+; AVX512F-ONLY-NEXT:    orq %r13, %r10
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r13d
+; AVX512F-ONLY-NEXT:    kshiftrw $7, %k3, %k4
+; AVX512F-ONLY-NEXT:    shlq $17, %r12
+; AVX512F-ONLY-NEXT:    orq %r10, %r12
+; AVX512F-ONLY-NEXT:    movzbl %r13b, %r10d
+; AVX512F-ONLY-NEXT:    andl $1, %r10d
+; AVX512F-ONLY-NEXT:    movq %r10, %r13
+; AVX512F-ONLY-NEXT:    shlq $18, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    movq %r10, %r12
+; AVX512F-ONLY-NEXT:    shlq $19, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r13d
+; AVX512F-ONLY-NEXT:    kshiftrw $8, %k3, %k4
+; AVX512F-ONLY-NEXT:    shlq $20, %r10
+; AVX512F-ONLY-NEXT:    orq %r12, %r10
+; AVX512F-ONLY-NEXT:    movzbl %r13b, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $21, %r13
+; AVX512F-ONLY-NEXT:    orq %r10, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %r10
+; AVX512F-ONLY-NEXT:    shlq $22, %r10
+; AVX512F-ONLY-NEXT:    orq %r13, %r10
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r13d
+; AVX512F-ONLY-NEXT:    kshiftrw $9, %k3, %k4
+; AVX512F-ONLY-NEXT:    shlq $23, %r12
+; AVX512F-ONLY-NEXT:    orq %r10, %r12
+; AVX512F-ONLY-NEXT:    movzbl %r13b, %r10d
+; AVX512F-ONLY-NEXT:    andl $1, %r10d
+; AVX512F-ONLY-NEXT:    movq %r10, %r13
+; AVX512F-ONLY-NEXT:    shlq $24, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    movq %r10, %r12
+; AVX512F-ONLY-NEXT:    shlq $25, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r13d
+; AVX512F-ONLY-NEXT:    kshiftrw $10, %k0, %k4
+; AVX512F-ONLY-NEXT:    shlq $26, %r10
+; AVX512F-ONLY-NEXT:    orq %r12, %r10
+; AVX512F-ONLY-NEXT:    movzbl %r13b, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $27, %r13
+; AVX512F-ONLY-NEXT:    orq %r10, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %r10
+; AVX512F-ONLY-NEXT:    shlq $28, %r10
+; AVX512F-ONLY-NEXT:    orq %r13, %r10
+; AVX512F-ONLY-NEXT:    shlq $29, %r12
+; AVX512F-ONLY-NEXT:    orq %r10, %r12
+; AVX512F-ONLY-NEXT:    movzbl %r15b, %r13d
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $30, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r10d
+; AVX512F-ONLY-NEXT:    kshiftrw $11, %k3, %k4
+; AVX512F-ONLY-NEXT:    shlq $31, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k4, %eax
+; AVX512F-ONLY-NEXT:    kshiftrw $12, %k3, %k4
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    shlq $32, %r15
+; AVX512F-ONLY-NEXT:    orq %r13, %r15
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $33, %r12
+; AVX512F-ONLY-NEXT:    orq %r15, %r12
+; AVX512F-ONLY-NEXT:    movq %rax, %r15
+; AVX512F-ONLY-NEXT:    shlq $34, %r15
+; AVX512F-ONLY-NEXT:    orq %r12, %r15
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $13, %k3, %k4
+; AVX512F-ONLY-NEXT:    shlq $35, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %r15
+; AVX512F-ONLY-NEXT:    shlq $36, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $37, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r15d
+; AVX512F-ONLY-NEXT:    kshiftrw $14, %k3, %k4
+; AVX512F-ONLY-NEXT:    shlq $38, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $39, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $40, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k4, %eax
+; AVX512F-ONLY-NEXT:    kshiftrw $15, %k3, %k3
+; AVX512F-ONLY-NEXT:    shlq $41, %r15
+; AVX512F-ONLY-NEXT:    orq %r12, %r15
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $42, %r12
+; AVX512F-ONLY-NEXT:    orq %r15, %r12
+; AVX512F-ONLY-NEXT:    movq %rax, %r15
+; AVX512F-ONLY-NEXT:    shlq $43, %r15
+; AVX512F-ONLY-NEXT:    orq %r12, %r15
+; AVX512F-ONLY-NEXT:    kmovw %k3, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $1, %k1, %k3
+; AVX512F-ONLY-NEXT:    shlq $44, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %r15
+; AVX512F-ONLY-NEXT:    shlq $45, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $46, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $47, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k1, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %r15
+; AVX512F-ONLY-NEXT:    shlq $48, %r15
+; AVX512F-ONLY-NEXT:    orq %r12, %r15
+; AVX512F-ONLY-NEXT:    movq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $49, %r12
+; AVX512F-ONLY-NEXT:    orq %r15, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k3, %r15d
+; AVX512F-ONLY-NEXT:    kshiftrw $2, %k1, %k3
+; AVX512F-ONLY-NEXT:    shlq $50, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $51, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $52, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k3, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $3, %k1, %k3
+; AVX512F-ONLY-NEXT:    shlq $53, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $54, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r15
+; AVX512F-ONLY-NEXT:    shlq $55, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    kmovw %k3, %eax
+; AVX512F-ONLY-NEXT:    kshiftrw $4, %k1, %k3
+; AVX512F-ONLY-NEXT:    shlq $56, %r12
+; AVX512F-ONLY-NEXT:    orq %r15, %r12
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %r15
+; AVX512F-ONLY-NEXT:    shlq $57, %r15
+; AVX512F-ONLY-NEXT:    orq %r12, %r15
+; AVX512F-ONLY-NEXT:    movq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $58, %r12
+; AVX512F-ONLY-NEXT:    orq %r15, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k3, %r13d
+; AVX512F-ONLY-NEXT:    kshiftrw $5, %k1, %k3
+; AVX512F-ONLY-NEXT:    shlq $59, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %r15
+; AVX512F-ONLY-NEXT:    shlq $60, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $61, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k3, %r15d
+; AVX512F-ONLY-NEXT:    kshiftrw $11, %k0, %k3
+; AVX512F-ONLY-NEXT:    shlq $62, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movzbl %r15b, %r15d
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $63, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k3, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $14, %k0, %k3
+; AVX512F-ONLY-NEXT:    orq %rbx, %rax
+; AVX512F-ONLY-NEXT:    movq %rax, (%rsi)
+; AVX512F-ONLY-NEXT:    movzbl %r12b, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movzbl %r10b, %ebx
+; AVX512F-ONLY-NEXT:    movl %ebx, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    leaq (%r12,%rax,2), %r12
+; AVX512F-ONLY-NEXT:    leaq (%r12,%rax,4), %r12
+; AVX512F-ONLY-NEXT:    leaq (%r12,%rax,8), %rax
+; AVX512F-ONLY-NEXT:    movzbl %r14b, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $4, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %r14
+; AVX512F-ONLY-NEXT:    shlq $5, %r14
+; AVX512F-ONLY-NEXT:    orq %r13, %r14
+; AVX512F-ONLY-NEXT:    shlq $6, %r12
+; AVX512F-ONLY-NEXT:    movzbl %bpl, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %r13
+; AVX512F-ONLY-NEXT:    shlq $7, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    movq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $8, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k3, %ebp
+; AVX512F-ONLY-NEXT:    kshiftrw $15, %k0, %k3
+; AVX512F-ONLY-NEXT:    shlq $9, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movzbl %bpl, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $10, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $11, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k3, %ebp
+; AVX512F-ONLY-NEXT:    kshiftrw $1, %k2, %k3
+; AVX512F-ONLY-NEXT:    shlq $12, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movzbl %bpl, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %r13
+; AVX512F-ONLY-NEXT:    shlq $13, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    movq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $14, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $15, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k2, %ebp
+; AVX512F-ONLY-NEXT:    movzbl %bpl, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $16, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $17, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k3, %ebp
+; AVX512F-ONLY-NEXT:    kshiftrw $2, %k2, %k3
+; AVX512F-ONLY-NEXT:    shlq $18, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movzbl %bpl, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %r13
+; AVX512F-ONLY-NEXT:    shlq $19, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    movq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $20, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k3, %ebp
+; AVX512F-ONLY-NEXT:    kshiftrw $3, %k2, %k3
+; AVX512F-ONLY-NEXT:    shlq $21, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movzbl %bpl, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $22, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $23, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k3, %ebp
+; AVX512F-ONLY-NEXT:    kshiftrw $4, %k2, %k3
+; AVX512F-ONLY-NEXT:    shlq $24, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movzbl %bpl, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %r13
+; AVX512F-ONLY-NEXT:    shlq $25, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    movq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $26, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k3, %ebp
+; AVX512F-ONLY-NEXT:    kshiftrw $5, %k2, %k3
+; AVX512F-ONLY-NEXT:    shlq $27, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movzbl %bpl, %r13d
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $28, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $29, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k3, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $6, %k2, %k3
+; AVX512F-ONLY-NEXT:    shlq $30, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movzbl %r12b, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    shlq $31, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $32, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k3, %eax
+; AVX512F-ONLY-NEXT:    kshiftrw $7, %k2, %k3
+; AVX512F-ONLY-NEXT:    shlq $33, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %r13
+; AVX512F-ONLY-NEXT:    shlq $34, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    movq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $35, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k3, %r13d
+; AVX512F-ONLY-NEXT:    kshiftrw $8, %k2, %k3
+; AVX512F-ONLY-NEXT:    shlq $36, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $37, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $38, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k3, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $9, %k2, %k3
+; AVX512F-ONLY-NEXT:    shlq $39, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $40, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $41, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k3, %eax
+; AVX512F-ONLY-NEXT:    kshiftrw $10, %k2, %k3
+; AVX512F-ONLY-NEXT:    shlq $42, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %r13
+; AVX512F-ONLY-NEXT:    shlq $43, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    movq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $44, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k3, %r13d
+; AVX512F-ONLY-NEXT:    kshiftrw $11, %k2, %k3
+; AVX512F-ONLY-NEXT:    shlq $45, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $46, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $47, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k3, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $12, %k2, %k3
+; AVX512F-ONLY-NEXT:    shlq $48, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $49, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $50, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k3, %eax
+; AVX512F-ONLY-NEXT:    kshiftrw $13, %k2, %k3
+; AVX512F-ONLY-NEXT:    shlq $51, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %r13
+; AVX512F-ONLY-NEXT:    shlq $52, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    movq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $53, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k3, %r13d
+; AVX512F-ONLY-NEXT:    kshiftrw $14, %k2, %k2
+; AVX512F-ONLY-NEXT:    shlq $54, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $55, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $56, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $10, %k1, %k2
+; AVX512F-ONLY-NEXT:    shlq $57, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $58, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $59, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    shlq $60, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    movl %r11d, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %r13
+; AVX512F-ONLY-NEXT:    shlq $61, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k2, %ebp
+; AVX512F-ONLY-NEXT:    kshiftrw $9, %k1, %k2
+; AVX512F-ONLY-NEXT:    shlq $62, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $8, %k1, %k2
+; AVX512F-ONLY-NEXT:    movzbl %r11b, %r11d
+; AVX512F-ONLY-NEXT:    shlq $63, %r11
+; AVX512F-ONLY-NEXT:    orq %rax, %r11
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r13d
+; AVX512F-ONLY-NEXT:    kshiftrw $7, %k1, %k2
+; AVX512F-ONLY-NEXT:    orq %r14, %r11
+; AVX512F-ONLY-NEXT:    kmovw %k2, %eax
+; AVX512F-ONLY-NEXT:    kshiftrw $6, %k1, %k1
+; AVX512F-ONLY-NEXT:    movq %r11, 16(%rsi)
+; AVX512F-ONLY-NEXT:    kmovw %k1, %r11d
+; AVX512F-ONLY-NEXT:    kshiftrw $1, %k0, %k1
+; AVX512F-ONLY-NEXT:    movzbl %r11b, %r14d
+; AVX512F-ONLY-NEXT:    andl $1, %r14d
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    leaq (%r15,%r15,2), %r11
+; AVX512F-ONLY-NEXT:    leaq (%r11,%r14,4), %r11
+; AVX512F-ONLY-NEXT:    leaq (%r11,%r14,8), %r11
+; AVX512F-ONLY-NEXT:    shlq $4, %r14
+; AVX512F-ONLY-NEXT:    orq %r11, %r14
+; AVX512F-ONLY-NEXT:    movzbl %al, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %r11
+; AVX512F-ONLY-NEXT:    shlq $5, %r11
+; AVX512F-ONLY-NEXT:    orq %r14, %r11
+; AVX512F-ONLY-NEXT:    movq %rax, %r14
+; AVX512F-ONLY-NEXT:    shlq $6, %r14
+; AVX512F-ONLY-NEXT:    shlq $7, %rax
+; AVX512F-ONLY-NEXT:    orq %r14, %rax
+; AVX512F-ONLY-NEXT:    movzbl %r13b, %r14d
+; AVX512F-ONLY-NEXT:    andl $1, %r14d
+; AVX512F-ONLY-NEXT:    movq %r14, %r13
+; AVX512F-ONLY-NEXT:    shlq $8, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r14, %r15
+; AVX512F-ONLY-NEXT:    shlq $9, %r15
+; AVX512F-ONLY-NEXT:    orq %r13, %r15
+; AVX512F-ONLY-NEXT:    kmovw %k1, %r13d
+; AVX512F-ONLY-NEXT:    kshiftrw $2, %k0, %k1
+; AVX512F-ONLY-NEXT:    shlq $10, %r14
+; AVX512F-ONLY-NEXT:    orq %r15, %r14
+; AVX512F-ONLY-NEXT:    movzbl %r12b, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %r15
+; AVX512F-ONLY-NEXT:    shlq $11, %r15
+; AVX512F-ONLY-NEXT:    orq %r14, %r15
+; AVX512F-ONLY-NEXT:    movq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $12, %r12
+; AVX512F-ONLY-NEXT:    orq %r15, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k1, %r14d
+; AVX512F-ONLY-NEXT:    kshiftrw $3, %k0, %k1
+; AVX512F-ONLY-NEXT:    shlq $13, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movzbl %bpl, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %r15
+; AVX512F-ONLY-NEXT:    shlq $14, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $15, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k1, %r15d
+; AVX512F-ONLY-NEXT:    kshiftrw $4, %k0, %k1
+; AVX512F-ONLY-NEXT:    shlq $16, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movzbl %r9b, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %r9
+; AVX512F-ONLY-NEXT:    shlq $17, %r9
+; AVX512F-ONLY-NEXT:    orq %r12, %r9
+; AVX512F-ONLY-NEXT:    movq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $18, %r12
+; AVX512F-ONLY-NEXT:    orq %r9, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k1, %r9d
+; AVX512F-ONLY-NEXT:    kshiftrw $5, %k0, %k1
+; AVX512F-ONLY-NEXT:    shlq $19, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movzbl %r8b, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %r8
+; AVX512F-ONLY-NEXT:    shlq $20, %r8
+; AVX512F-ONLY-NEXT:    orq %rax, %r8
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $21, %rax
+; AVX512F-ONLY-NEXT:    orq %r8, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k1, %r8d
+; AVX512F-ONLY-NEXT:    kshiftrw $6, %k0, %k1
+; AVX512F-ONLY-NEXT:    shlq $22, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movzbl %dil, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %rdi
+; AVX512F-ONLY-NEXT:    shlq $23, %rdi
+; AVX512F-ONLY-NEXT:    orq %r12, %rdi
+; AVX512F-ONLY-NEXT:    movq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $24, %r12
+; AVX512F-ONLY-NEXT:    orq %rdi, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k1, %edi
+; AVX512F-ONLY-NEXT:    kshiftrw $8, %k0, %k1
+; AVX512F-ONLY-NEXT:    shlq $25, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movzbl %dl, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rdx
+; AVX512F-ONLY-NEXT:    shlq $26, %rdx
+; AVX512F-ONLY-NEXT:    orq %rax, %rdx
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $27, %rax
+; AVX512F-ONLY-NEXT:    orq %rdx, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k1, %edx
+; AVX512F-ONLY-NEXT:    shlq $28, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movzbl %cl, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %rcx
+; AVX512F-ONLY-NEXT:    shlq $29, %rcx
+; AVX512F-ONLY-NEXT:    orq %r12, %rcx
+; AVX512F-ONLY-NEXT:    movq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $30, %r12
+; AVX512F-ONLY-NEXT:    orq %rcx, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k0, %ebp
+; AVX512F-ONLY-NEXT:    kshiftrw $9, %k0, %k0
+; AVX512F-ONLY-NEXT:    shlq $31, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    andl $1, %ebp
+; AVX512F-ONLY-NEXT:    movq %rbp, %rcx
+; AVX512F-ONLY-NEXT:    shlq $32, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    movq %rbp, %rax
+; AVX512F-ONLY-NEXT:    shlq $33, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k0, %ecx
+; AVX512F-ONLY-NEXT:    shlq $34, %rbp
+; AVX512F-ONLY-NEXT:    orq %rax, %rbp
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $35, %rax
+; AVX512F-ONLY-NEXT:    orq %rbp, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $36, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $37, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    andl $1, %r14d
+; AVX512F-ONLY-NEXT:    movq %r14, %rax
+; AVX512F-ONLY-NEXT:    shlq $38, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r14, %r12
+; AVX512F-ONLY-NEXT:    shlq $39, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $40, %r14
+; AVX512F-ONLY-NEXT:    orq %r12, %r14
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $41, %rax
+; AVX512F-ONLY-NEXT:    orq %r14, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %r14
+; AVX512F-ONLY-NEXT:    shlq $42, %r14
+; AVX512F-ONLY-NEXT:    orq %rax, %r14
+; AVX512F-ONLY-NEXT:    shlq $43, %r15
+; AVX512F-ONLY-NEXT:    orq %r14, %r15
+; AVX512F-ONLY-NEXT:    andl $1, %r9d
+; AVX512F-ONLY-NEXT:    movq %r9, %rax
+; AVX512F-ONLY-NEXT:    shlq $44, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movq %r9, %r14
+; AVX512F-ONLY-NEXT:    shlq $45, %r14
+; AVX512F-ONLY-NEXT:    orq %rax, %r14
+; AVX512F-ONLY-NEXT:    shlq $46, %r9
+; AVX512F-ONLY-NEXT:    orq %r14, %r9
+; AVX512F-ONLY-NEXT:    andl $1, %r8d
+; AVX512F-ONLY-NEXT:    movq %r8, %rax
+; AVX512F-ONLY-NEXT:    shlq $47, %rax
+; AVX512F-ONLY-NEXT:    orq %r9, %rax
+; AVX512F-ONLY-NEXT:    movq %r8, %r9
+; AVX512F-ONLY-NEXT:    shlq $48, %r9
+; AVX512F-ONLY-NEXT:    orq %rax, %r9
+; AVX512F-ONLY-NEXT:    shlq $49, %r8
+; AVX512F-ONLY-NEXT:    orq %r9, %r8
+; AVX512F-ONLY-NEXT:    andl $1, %edi
+; AVX512F-ONLY-NEXT:    movq %rdi, %rax
+; AVX512F-ONLY-NEXT:    shlq $50, %rax
+; AVX512F-ONLY-NEXT:    orq %r8, %rax
+; AVX512F-ONLY-NEXT:    movq %rdi, %r8
+; AVX512F-ONLY-NEXT:    shlq $51, %r8
+; AVX512F-ONLY-NEXT:    orq %rax, %r8
+; AVX512F-ONLY-NEXT:    shlq $52, %rdi
+; AVX512F-ONLY-NEXT:    orq %r8, %rdi
+; AVX512F-ONLY-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %r8
+; AVX512F-ONLY-NEXT:    shlq $53, %r8
+; AVX512F-ONLY-NEXT:    orq %rdi, %r8
+; AVX512F-ONLY-NEXT:    movq %rax, %rdi
+; AVX512F-ONLY-NEXT:    shlq $54, %rdi
+; AVX512F-ONLY-NEXT:    orq %r8, %rdi
+; AVX512F-ONLY-NEXT:    shlq $55, %rax
+; AVX512F-ONLY-NEXT:    orq %rdi, %rax
+; AVX512F-ONLY-NEXT:    andl $1, %edx
+; AVX512F-ONLY-NEXT:    movq %rdx, %rdi
+; AVX512F-ONLY-NEXT:    shlq $56, %rdi
+; AVX512F-ONLY-NEXT:    orq %rax, %rdi
+; AVX512F-ONLY-NEXT:    movq %rdx, %rax
+; AVX512F-ONLY-NEXT:    shlq $57, %rax
+; AVX512F-ONLY-NEXT:    orq %rdi, %rax
+; AVX512F-ONLY-NEXT:    shlq $58, %rdx
+; AVX512F-ONLY-NEXT:    orq %rax, %rdx
+; AVX512F-ONLY-NEXT:    andl $1, %ecx
+; AVX512F-ONLY-NEXT:    movq %rcx, %rax
+; AVX512F-ONLY-NEXT:    shlq $59, %rax
+; AVX512F-ONLY-NEXT:    orq %rdx, %rax
+; AVX512F-ONLY-NEXT:    movq %rcx, %rdx
+; AVX512F-ONLY-NEXT:    shlq $60, %rdx
+; AVX512F-ONLY-NEXT:    orq %rax, %rdx
+; AVX512F-ONLY-NEXT:    shlq $61, %rcx
+; AVX512F-ONLY-NEXT:    orq %rdx, %rcx
+; AVX512F-ONLY-NEXT:    andl $1, %r10d
+; AVX512F-ONLY-NEXT:    shlq $62, %r10
+; AVX512F-ONLY-NEXT:    orq %rcx, %r10
+; AVX512F-ONLY-NEXT:    shlq $63, %rbx
+; AVX512F-ONLY-NEXT:    orq %r10, %rbx
+; AVX512F-ONLY-NEXT:    orq %r11, %rbx
+; AVX512F-ONLY-NEXT:    movq %rbx, 8(%rsi)
+; AVX512F-ONLY-NEXT:    popq %rbx
+; AVX512F-ONLY-NEXT:    popq %r12
+; AVX512F-ONLY-NEXT:    popq %r13
+; AVX512F-ONLY-NEXT:    popq %r14
+; AVX512F-ONLY-NEXT:    popq %r15
+; AVX512F-ONLY-NEXT:    popq %rbp
+; AVX512F-ONLY-NEXT:    retq
+;
+; AVX512DQ-LABEL: mask_replication_factor3_vf64:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    pushq %rbp
+; AVX512DQ-NEXT:    pushq %r15
+; AVX512DQ-NEXT:    pushq %r14
+; AVX512DQ-NEXT:    pushq %r13
+; AVX512DQ-NEXT:    pushq %r12
+; AVX512DQ-NEXT:    pushq %rbx
+; AVX512DQ-NEXT:    kmovw (%rdi), %k3
+; AVX512DQ-NEXT:    kmovw 2(%rdi), %k1
+; AVX512DQ-NEXT:    kmovw 4(%rdi), %k0
+; AVX512DQ-NEXT:    kmovw 6(%rdi), %k2
+; AVX512DQ-NEXT:    kshiftrw $7, %k0, %k4
+; AVX512DQ-NEXT:    kmovw %k4, %eax
+; AVX512DQ-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k4
+; AVX512DQ-NEXT:    kmovw %k4, %ecx
+; AVX512DQ-NEXT:    kshiftrw $14, %k1, %k4
+; AVX512DQ-NEXT:    kmovw %k4, %edx
+; AVX512DQ-NEXT:    kshiftrw $13, %k1, %k4
+; AVX512DQ-NEXT:    kmovw %k4, %edi
+; AVX512DQ-NEXT:    kshiftrw $12, %k1, %k4
+; AVX512DQ-NEXT:    kmovw %k4, %r8d
+; AVX512DQ-NEXT:    kshiftrw $11, %k1, %k4
+; AVX512DQ-NEXT:    kmovw %k4, %r9d
+; AVX512DQ-NEXT:    kshiftrw $15, %k2, %k4
+; AVX512DQ-NEXT:    kmovw %k4, %r11d
+; AVX512DQ-NEXT:    kshiftrw $13, %k0, %k4
+; AVX512DQ-NEXT:    kmovw %k4, %ebp
+; AVX512DQ-NEXT:    kshiftrw $12, %k0, %k4
+; AVX512DQ-NEXT:    kmovw %k4, %r14d
+; AVX512DQ-NEXT:    kshiftrw $10, %k3, %k4
+; AVX512DQ-NEXT:    kmovw %k4, %r15d
+; AVX512DQ-NEXT:    kshiftrw $2, %k3, %k4
+; AVX512DQ-NEXT:    kmovw %k4, %r10d
+; AVX512DQ-NEXT:    kshiftrw $1, %k3, %k4
+; AVX512DQ-NEXT:    kmovw %k4, %ebx
+; AVX512DQ-NEXT:    kshiftrw $3, %k3, %k4
+; AVX512DQ-NEXT:    kmovw %k3, %r12d
+; AVX512DQ-NEXT:    movzbl %r12b, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    leaq (%r12,%r12,2), %r13
+; AVX512DQ-NEXT:    leaq (%r13,%r12,4), %r12
+; AVX512DQ-NEXT:    movzbl %bl, %ebx
+; AVX512DQ-NEXT:    andl $1, %ebx
+; AVX512DQ-NEXT:    leaq (%r12,%rbx,8), %r12
+; AVX512DQ-NEXT:    movq %rbx, %r13
+; AVX512DQ-NEXT:    shlq $4, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    shlq $5, %rbx
+; AVX512DQ-NEXT:    orq %r13, %rbx
+; AVX512DQ-NEXT:    movzbl %r10b, %r10d
+; AVX512DQ-NEXT:    andl $1, %r10d
+; AVX512DQ-NEXT:    movq %r10, %r12
+; AVX512DQ-NEXT:    shlq $6, %r12
+; AVX512DQ-NEXT:    movq %r10, %r13
+; AVX512DQ-NEXT:    shlq $7, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    kmovw %k4, %r12d
+; AVX512DQ-NEXT:    kshiftrw $4, %k3, %k4
+; AVX512DQ-NEXT:    shlq $8, %r10
+; AVX512DQ-NEXT:    orq %r13, %r10
+; AVX512DQ-NEXT:    movzbl %r12b, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $9, %r13
+; AVX512DQ-NEXT:    orq %r10, %r13
+; AVX512DQ-NEXT:    movq %r12, %r10
+; AVX512DQ-NEXT:    shlq $10, %r10
+; AVX512DQ-NEXT:    orq %r13, %r10
+; AVX512DQ-NEXT:    kmovw %k4, %r13d
+; AVX512DQ-NEXT:    kshiftrw $5, %k3, %k4
+; AVX512DQ-NEXT:    shlq $11, %r12
+; AVX512DQ-NEXT:    orq %r10, %r12
+; AVX512DQ-NEXT:    movzbl %r13b, %r10d
+; AVX512DQ-NEXT:    andl $1, %r10d
+; AVX512DQ-NEXT:    movq %r10, %r13
+; AVX512DQ-NEXT:    shlq $12, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    movq %r10, %r12
+; AVX512DQ-NEXT:    shlq $13, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    kmovw %k4, %r13d
+; AVX512DQ-NEXT:    kshiftrw $6, %k3, %k4
+; AVX512DQ-NEXT:    shlq $14, %r10
+; AVX512DQ-NEXT:    orq %r12, %r10
+; AVX512DQ-NEXT:    movzbl %r13b, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $15, %r13
+; AVX512DQ-NEXT:    orq %r10, %r13
+; AVX512DQ-NEXT:    movq %r12, %r10
+; AVX512DQ-NEXT:    shlq $16, %r10
+; AVX512DQ-NEXT:    orq %r13, %r10
+; AVX512DQ-NEXT:    kmovw %k4, %r13d
+; AVX512DQ-NEXT:    kshiftrw $7, %k3, %k4
+; AVX512DQ-NEXT:    shlq $17, %r12
+; AVX512DQ-NEXT:    orq %r10, %r12
+; AVX512DQ-NEXT:    movzbl %r13b, %r10d
+; AVX512DQ-NEXT:    andl $1, %r10d
+; AVX512DQ-NEXT:    movq %r10, %r13
+; AVX512DQ-NEXT:    shlq $18, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    movq %r10, %r12
+; AVX512DQ-NEXT:    shlq $19, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    kmovw %k4, %r13d
+; AVX512DQ-NEXT:    kshiftrw $8, %k3, %k4
+; AVX512DQ-NEXT:    shlq $20, %r10
+; AVX512DQ-NEXT:    orq %r12, %r10
+; AVX512DQ-NEXT:    movzbl %r13b, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $21, %r13
+; AVX512DQ-NEXT:    orq %r10, %r13
+; AVX512DQ-NEXT:    movq %r12, %r10
+; AVX512DQ-NEXT:    shlq $22, %r10
+; AVX512DQ-NEXT:    orq %r13, %r10
+; AVX512DQ-NEXT:    kmovw %k4, %r13d
+; AVX512DQ-NEXT:    kshiftrw $9, %k3, %k4
+; AVX512DQ-NEXT:    shlq $23, %r12
+; AVX512DQ-NEXT:    orq %r10, %r12
+; AVX512DQ-NEXT:    movzbl %r13b, %r10d
+; AVX512DQ-NEXT:    andl $1, %r10d
+; AVX512DQ-NEXT:    movq %r10, %r13
+; AVX512DQ-NEXT:    shlq $24, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    movq %r10, %r12
+; AVX512DQ-NEXT:    shlq $25, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    kmovw %k4, %r13d
+; AVX512DQ-NEXT:    kshiftrw $10, %k0, %k4
+; AVX512DQ-NEXT:    shlq $26, %r10
+; AVX512DQ-NEXT:    orq %r12, %r10
+; AVX512DQ-NEXT:    movzbl %r13b, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $27, %r13
+; AVX512DQ-NEXT:    orq %r10, %r13
+; AVX512DQ-NEXT:    movq %r12, %r10
+; AVX512DQ-NEXT:    shlq $28, %r10
+; AVX512DQ-NEXT:    orq %r13, %r10
+; AVX512DQ-NEXT:    shlq $29, %r12
+; AVX512DQ-NEXT:    orq %r10, %r12
+; AVX512DQ-NEXT:    movzbl %r15b, %r13d
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $30, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    kmovw %k4, %r10d
+; AVX512DQ-NEXT:    kshiftrw $11, %k3, %k4
+; AVX512DQ-NEXT:    shlq $31, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    kmovw %k4, %eax
+; AVX512DQ-NEXT:    kshiftrw $12, %k3, %k4
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    shlq $32, %r15
+; AVX512DQ-NEXT:    orq %r13, %r15
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %r12
+; AVX512DQ-NEXT:    shlq $33, %r12
+; AVX512DQ-NEXT:    orq %r15, %r12
+; AVX512DQ-NEXT:    movq %rax, %r15
+; AVX512DQ-NEXT:    shlq $34, %r15
+; AVX512DQ-NEXT:    orq %r12, %r15
+; AVX512DQ-NEXT:    kmovw %k4, %r12d
+; AVX512DQ-NEXT:    kshiftrw $13, %k3, %k4
+; AVX512DQ-NEXT:    shlq $35, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %r15
+; AVX512DQ-NEXT:    shlq $36, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $37, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    kmovw %k4, %r15d
+; AVX512DQ-NEXT:    kshiftrw $14, %k3, %k4
+; AVX512DQ-NEXT:    shlq $38, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $39, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $40, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    kmovw %k4, %eax
+; AVX512DQ-NEXT:    kshiftrw $15, %k3, %k3
+; AVX512DQ-NEXT:    shlq $41, %r15
+; AVX512DQ-NEXT:    orq %r12, %r15
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %r12
+; AVX512DQ-NEXT:    shlq $42, %r12
+; AVX512DQ-NEXT:    orq %r15, %r12
+; AVX512DQ-NEXT:    movq %rax, %r15
+; AVX512DQ-NEXT:    shlq $43, %r15
+; AVX512DQ-NEXT:    orq %r12, %r15
+; AVX512DQ-NEXT:    kmovw %k3, %r12d
+; AVX512DQ-NEXT:    kshiftrw $1, %k1, %k3
+; AVX512DQ-NEXT:    shlq $44, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %r15
+; AVX512DQ-NEXT:    shlq $45, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $46, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    shlq $47, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    kmovw %k1, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %r15
+; AVX512DQ-NEXT:    shlq $48, %r15
+; AVX512DQ-NEXT:    orq %r12, %r15
+; AVX512DQ-NEXT:    movq %rax, %r12
+; AVX512DQ-NEXT:    shlq $49, %r12
+; AVX512DQ-NEXT:    orq %r15, %r12
+; AVX512DQ-NEXT:    kmovw %k3, %r15d
+; AVX512DQ-NEXT:    kshiftrw $2, %k1, %k3
+; AVX512DQ-NEXT:    shlq $50, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $51, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $52, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    kmovw %k3, %r12d
+; AVX512DQ-NEXT:    kshiftrw $3, %k1, %k3
+; AVX512DQ-NEXT:    shlq $53, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $54, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movq %r12, %r15
+; AVX512DQ-NEXT:    shlq $55, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    kmovw %k3, %eax
+; AVX512DQ-NEXT:    kshiftrw $4, %k1, %k3
+; AVX512DQ-NEXT:    shlq $56, %r12
+; AVX512DQ-NEXT:    orq %r15, %r12
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %r15
+; AVX512DQ-NEXT:    shlq $57, %r15
+; AVX512DQ-NEXT:    orq %r12, %r15
+; AVX512DQ-NEXT:    movq %rax, %r12
+; AVX512DQ-NEXT:    shlq $58, %r12
+; AVX512DQ-NEXT:    orq %r15, %r12
+; AVX512DQ-NEXT:    kmovw %k3, %r13d
+; AVX512DQ-NEXT:    kshiftrw $5, %k1, %k3
+; AVX512DQ-NEXT:    shlq $59, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %r15
+; AVX512DQ-NEXT:    shlq $60, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $61, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    kmovw %k3, %r15d
+; AVX512DQ-NEXT:    kshiftrw $11, %k0, %k3
+; AVX512DQ-NEXT:    shlq $62, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movzbl %r15b, %r15d
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $63, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    kmovw %k3, %r12d
+; AVX512DQ-NEXT:    kshiftrw $14, %k0, %k3
+; AVX512DQ-NEXT:    orq %rbx, %rax
+; AVX512DQ-NEXT:    movq %rax, (%rsi)
+; AVX512DQ-NEXT:    movzbl %r12b, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movzbl %r10b, %ebx
+; AVX512DQ-NEXT:    movl %ebx, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    leaq (%r12,%rax,2), %r12
+; AVX512DQ-NEXT:    leaq (%r12,%rax,4), %r12
+; AVX512DQ-NEXT:    leaq (%r12,%rax,8), %rax
+; AVX512DQ-NEXT:    movzbl %r14b, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $4, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r12, %r14
+; AVX512DQ-NEXT:    shlq $5, %r14
+; AVX512DQ-NEXT:    orq %r13, %r14
+; AVX512DQ-NEXT:    shlq $6, %r12
+; AVX512DQ-NEXT:    movzbl %bpl, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %r13
+; AVX512DQ-NEXT:    shlq $7, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    movq %rax, %r12
+; AVX512DQ-NEXT:    shlq $8, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    kmovw %k3, %ebp
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k3
+; AVX512DQ-NEXT:    shlq $9, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movzbl %bpl, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $10, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $11, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    kmovw %k3, %ebp
+; AVX512DQ-NEXT:    kshiftrw $1, %k2, %k3
+; AVX512DQ-NEXT:    shlq $12, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movzbl %bpl, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %r13
+; AVX512DQ-NEXT:    shlq $13, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    movq %rax, %r12
+; AVX512DQ-NEXT:    shlq $14, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    shlq $15, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    kmovw %k2, %ebp
+; AVX512DQ-NEXT:    movzbl %bpl, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $16, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $17, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    kmovw %k3, %ebp
+; AVX512DQ-NEXT:    kshiftrw $2, %k2, %k3
+; AVX512DQ-NEXT:    shlq $18, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movzbl %bpl, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %r13
+; AVX512DQ-NEXT:    shlq $19, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    movq %rax, %r12
+; AVX512DQ-NEXT:    shlq $20, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    kmovw %k3, %ebp
+; AVX512DQ-NEXT:    kshiftrw $3, %k2, %k3
+; AVX512DQ-NEXT:    shlq $21, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movzbl %bpl, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $22, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $23, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    kmovw %k3, %ebp
+; AVX512DQ-NEXT:    kshiftrw $4, %k2, %k3
+; AVX512DQ-NEXT:    shlq $24, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movzbl %bpl, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %r13
+; AVX512DQ-NEXT:    shlq $25, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    movq %rax, %r12
+; AVX512DQ-NEXT:    shlq $26, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    kmovw %k3, %ebp
+; AVX512DQ-NEXT:    kshiftrw $5, %k2, %k3
+; AVX512DQ-NEXT:    shlq $27, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movzbl %bpl, %r13d
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $28, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $29, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    kmovw %k3, %r12d
+; AVX512DQ-NEXT:    kshiftrw $6, %k2, %k3
+; AVX512DQ-NEXT:    shlq $30, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movzbl %r12b, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    shlq $31, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $32, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    kmovw %k3, %eax
+; AVX512DQ-NEXT:    kshiftrw $7, %k2, %k3
+; AVX512DQ-NEXT:    shlq $33, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %r13
+; AVX512DQ-NEXT:    shlq $34, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    movq %rax, %r12
+; AVX512DQ-NEXT:    shlq $35, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    kmovw %k3, %r13d
+; AVX512DQ-NEXT:    kshiftrw $8, %k2, %k3
+; AVX512DQ-NEXT:    shlq $36, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $37, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $38, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    kmovw %k3, %r12d
+; AVX512DQ-NEXT:    kshiftrw $9, %k2, %k3
+; AVX512DQ-NEXT:    shlq $39, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $40, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $41, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    kmovw %k3, %eax
+; AVX512DQ-NEXT:    kshiftrw $10, %k2, %k3
+; AVX512DQ-NEXT:    shlq $42, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %r13
+; AVX512DQ-NEXT:    shlq $43, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    movq %rax, %r12
+; AVX512DQ-NEXT:    shlq $44, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    kmovw %k3, %r13d
+; AVX512DQ-NEXT:    kshiftrw $11, %k2, %k3
+; AVX512DQ-NEXT:    shlq $45, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $46, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $47, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    kmovw %k3, %r12d
+; AVX512DQ-NEXT:    kshiftrw $12, %k2, %k3
+; AVX512DQ-NEXT:    shlq $48, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $49, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $50, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    kmovw %k3, %eax
+; AVX512DQ-NEXT:    kshiftrw $13, %k2, %k3
+; AVX512DQ-NEXT:    shlq $51, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %r13
+; AVX512DQ-NEXT:    shlq $52, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    movq %rax, %r12
+; AVX512DQ-NEXT:    shlq $53, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    kmovw %k3, %r13d
+; AVX512DQ-NEXT:    kshiftrw $14, %k2, %k2
+; AVX512DQ-NEXT:    shlq $54, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $55, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $56, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    kmovw %k2, %r12d
+; AVX512DQ-NEXT:    kshiftrw $10, %k1, %k2
+; AVX512DQ-NEXT:    shlq $57, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $58, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $59, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    shlq $60, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    movl %r11d, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %r13
+; AVX512DQ-NEXT:    shlq $61, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    kmovw %k2, %ebp
+; AVX512DQ-NEXT:    kshiftrw $9, %k1, %k2
+; AVX512DQ-NEXT:    shlq $62, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    kmovw %k2, %r12d
+; AVX512DQ-NEXT:    kshiftrw $8, %k1, %k2
+; AVX512DQ-NEXT:    movzbl %r11b, %r11d
+; AVX512DQ-NEXT:    shlq $63, %r11
+; AVX512DQ-NEXT:    orq %rax, %r11
+; AVX512DQ-NEXT:    kmovw %k2, %r13d
+; AVX512DQ-NEXT:    kshiftrw $7, %k1, %k2
+; AVX512DQ-NEXT:    orq %r14, %r11
+; AVX512DQ-NEXT:    kmovw %k2, %eax
+; AVX512DQ-NEXT:    kshiftrw $6, %k1, %k1
+; AVX512DQ-NEXT:    movq %r11, 16(%rsi)
+; AVX512DQ-NEXT:    kmovw %k1, %r11d
+; AVX512DQ-NEXT:    kshiftrw $1, %k0, %k1
+; AVX512DQ-NEXT:    movzbl %r11b, %r14d
+; AVX512DQ-NEXT:    andl $1, %r14d
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    leaq (%r15,%r15,2), %r11
+; AVX512DQ-NEXT:    leaq (%r11,%r14,4), %r11
+; AVX512DQ-NEXT:    leaq (%r11,%r14,8), %r11
+; AVX512DQ-NEXT:    shlq $4, %r14
+; AVX512DQ-NEXT:    orq %r11, %r14
+; AVX512DQ-NEXT:    movzbl %al, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %r11
+; AVX512DQ-NEXT:    shlq $5, %r11
+; AVX512DQ-NEXT:    orq %r14, %r11
+; AVX512DQ-NEXT:    movq %rax, %r14
+; AVX512DQ-NEXT:    shlq $6, %r14
+; AVX512DQ-NEXT:    shlq $7, %rax
+; AVX512DQ-NEXT:    orq %r14, %rax
+; AVX512DQ-NEXT:    movzbl %r13b, %r14d
+; AVX512DQ-NEXT:    andl $1, %r14d
+; AVX512DQ-NEXT:    movq %r14, %r13
+; AVX512DQ-NEXT:    shlq $8, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r14, %r15
+; AVX512DQ-NEXT:    shlq $9, %r15
+; AVX512DQ-NEXT:    orq %r13, %r15
+; AVX512DQ-NEXT:    kmovw %k1, %r13d
+; AVX512DQ-NEXT:    kshiftrw $2, %k0, %k1
+; AVX512DQ-NEXT:    shlq $10, %r14
+; AVX512DQ-NEXT:    orq %r15, %r14
+; AVX512DQ-NEXT:    movzbl %r12b, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %r15
+; AVX512DQ-NEXT:    shlq $11, %r15
+; AVX512DQ-NEXT:    orq %r14, %r15
+; AVX512DQ-NEXT:    movq %rax, %r12
+; AVX512DQ-NEXT:    shlq $12, %r12
+; AVX512DQ-NEXT:    orq %r15, %r12
+; AVX512DQ-NEXT:    kmovw %k1, %r14d
+; AVX512DQ-NEXT:    kshiftrw $3, %k0, %k1
+; AVX512DQ-NEXT:    shlq $13, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movzbl %bpl, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %r15
+; AVX512DQ-NEXT:    shlq $14, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $15, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    kmovw %k1, %r15d
+; AVX512DQ-NEXT:    kshiftrw $4, %k0, %k1
+; AVX512DQ-NEXT:    shlq $16, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movzbl %r9b, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %r9
+; AVX512DQ-NEXT:    shlq $17, %r9
+; AVX512DQ-NEXT:    orq %r12, %r9
+; AVX512DQ-NEXT:    movq %rax, %r12
+; AVX512DQ-NEXT:    shlq $18, %r12
+; AVX512DQ-NEXT:    orq %r9, %r12
+; AVX512DQ-NEXT:    kmovw %k1, %r9d
+; AVX512DQ-NEXT:    kshiftrw $5, %k0, %k1
+; AVX512DQ-NEXT:    shlq $19, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movzbl %r8b, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %r8
+; AVX512DQ-NEXT:    shlq $20, %r8
+; AVX512DQ-NEXT:    orq %rax, %r8
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $21, %rax
+; AVX512DQ-NEXT:    orq %r8, %rax
+; AVX512DQ-NEXT:    kmovw %k1, %r8d
+; AVX512DQ-NEXT:    kshiftrw $6, %k0, %k1
+; AVX512DQ-NEXT:    shlq $22, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movzbl %dil, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %rdi
+; AVX512DQ-NEXT:    shlq $23, %rdi
+; AVX512DQ-NEXT:    orq %r12, %rdi
+; AVX512DQ-NEXT:    movq %rax, %r12
+; AVX512DQ-NEXT:    shlq $24, %r12
+; AVX512DQ-NEXT:    orq %rdi, %r12
+; AVX512DQ-NEXT:    kmovw %k1, %edi
+; AVX512DQ-NEXT:    kshiftrw $8, %k0, %k1
+; AVX512DQ-NEXT:    shlq $25, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movzbl %dl, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rdx
+; AVX512DQ-NEXT:    shlq $26, %rdx
+; AVX512DQ-NEXT:    orq %rax, %rdx
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $27, %rax
+; AVX512DQ-NEXT:    orq %rdx, %rax
+; AVX512DQ-NEXT:    kmovw %k1, %edx
+; AVX512DQ-NEXT:    shlq $28, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movzbl %cl, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %rcx
+; AVX512DQ-NEXT:    shlq $29, %rcx
+; AVX512DQ-NEXT:    orq %r12, %rcx
+; AVX512DQ-NEXT:    movq %rax, %r12
+; AVX512DQ-NEXT:    shlq $30, %r12
+; AVX512DQ-NEXT:    orq %rcx, %r12
+; AVX512DQ-NEXT:    kmovw %k0, %ebp
+; AVX512DQ-NEXT:    kshiftrw $9, %k0, %k0
+; AVX512DQ-NEXT:    shlq $31, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    andl $1, %ebp
+; AVX512DQ-NEXT:    movq %rbp, %rcx
+; AVX512DQ-NEXT:    shlq $32, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    movq %rbp, %rax
+; AVX512DQ-NEXT:    shlq $33, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    kmovw %k0, %ecx
+; AVX512DQ-NEXT:    shlq $34, %rbp
+; AVX512DQ-NEXT:    orq %rax, %rbp
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $35, %rax
+; AVX512DQ-NEXT:    orq %rbp, %rax
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $36, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    shlq $37, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    andl $1, %r14d
+; AVX512DQ-NEXT:    movq %r14, %rax
+; AVX512DQ-NEXT:    shlq $38, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r14, %r12
+; AVX512DQ-NEXT:    shlq $39, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    shlq $40, %r14
+; AVX512DQ-NEXT:    orq %r12, %r14
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $41, %rax
+; AVX512DQ-NEXT:    orq %r14, %rax
+; AVX512DQ-NEXT:    movq %r15, %r14
+; AVX512DQ-NEXT:    shlq $42, %r14
+; AVX512DQ-NEXT:    orq %rax, %r14
+; AVX512DQ-NEXT:    shlq $43, %r15
+; AVX512DQ-NEXT:    orq %r14, %r15
+; AVX512DQ-NEXT:    andl $1, %r9d
+; AVX512DQ-NEXT:    movq %r9, %rax
+; AVX512DQ-NEXT:    shlq $44, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movq %r9, %r14
+; AVX512DQ-NEXT:    shlq $45, %r14
+; AVX512DQ-NEXT:    orq %rax, %r14
+; AVX512DQ-NEXT:    shlq $46, %r9
+; AVX512DQ-NEXT:    orq %r14, %r9
+; AVX512DQ-NEXT:    andl $1, %r8d
+; AVX512DQ-NEXT:    movq %r8, %rax
+; AVX512DQ-NEXT:    shlq $47, %rax
+; AVX512DQ-NEXT:    orq %r9, %rax
+; AVX512DQ-NEXT:    movq %r8, %r9
+; AVX512DQ-NEXT:    shlq $48, %r9
+; AVX512DQ-NEXT:    orq %rax, %r9
+; AVX512DQ-NEXT:    shlq $49, %r8
+; AVX512DQ-NEXT:    orq %r9, %r8
+; AVX512DQ-NEXT:    andl $1, %edi
+; AVX512DQ-NEXT:    movq %rdi, %rax
+; AVX512DQ-NEXT:    shlq $50, %rax
+; AVX512DQ-NEXT:    orq %r8, %rax
+; AVX512DQ-NEXT:    movq %rdi, %r8
+; AVX512DQ-NEXT:    shlq $51, %r8
+; AVX512DQ-NEXT:    orq %rax, %r8
+; AVX512DQ-NEXT:    shlq $52, %rdi
+; AVX512DQ-NEXT:    orq %r8, %rdi
+; AVX512DQ-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %r8
+; AVX512DQ-NEXT:    shlq $53, %r8
+; AVX512DQ-NEXT:    orq %rdi, %r8
+; AVX512DQ-NEXT:    movq %rax, %rdi
+; AVX512DQ-NEXT:    shlq $54, %rdi
+; AVX512DQ-NEXT:    orq %r8, %rdi
+; AVX512DQ-NEXT:    shlq $55, %rax
+; AVX512DQ-NEXT:    orq %rdi, %rax
+; AVX512DQ-NEXT:    andl $1, %edx
+; AVX512DQ-NEXT:    movq %rdx, %rdi
+; AVX512DQ-NEXT:    shlq $56, %rdi
+; AVX512DQ-NEXT:    orq %rax, %rdi
+; AVX512DQ-NEXT:    movq %rdx, %rax
+; AVX512DQ-NEXT:    shlq $57, %rax
+; AVX512DQ-NEXT:    orq %rdi, %rax
+; AVX512DQ-NEXT:    shlq $58, %rdx
+; AVX512DQ-NEXT:    orq %rax, %rdx
+; AVX512DQ-NEXT:    andl $1, %ecx
+; AVX512DQ-NEXT:    movq %rcx, %rax
+; AVX512DQ-NEXT:    shlq $59, %rax
+; AVX512DQ-NEXT:    orq %rdx, %rax
+; AVX512DQ-NEXT:    movq %rcx, %rdx
+; AVX512DQ-NEXT:    shlq $60, %rdx
+; AVX512DQ-NEXT:    orq %rax, %rdx
+; AVX512DQ-NEXT:    shlq $61, %rcx
+; AVX512DQ-NEXT:    orq %rdx, %rcx
+; AVX512DQ-NEXT:    andl $1, %r10d
+; AVX512DQ-NEXT:    shlq $62, %r10
+; AVX512DQ-NEXT:    orq %rcx, %r10
+; AVX512DQ-NEXT:    shlq $63, %rbx
+; AVX512DQ-NEXT:    orq %r10, %rbx
+; AVX512DQ-NEXT:    orq %r11, %rbx
+; AVX512DQ-NEXT:    movq %rbx, 8(%rsi)
+; AVX512DQ-NEXT:    popq %rbx
+; AVX512DQ-NEXT:    popq %r12
+; AVX512DQ-NEXT:    popq %r13
+; AVX512DQ-NEXT:    popq %r14
+; AVX512DQ-NEXT:    popq %r15
+; AVX512DQ-NEXT:    popq %rbp
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: mask_replication_factor3_vf64:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    pushq %rbp
+; AVX512BW-NEXT:    pushq %r15
+; AVX512BW-NEXT:    pushq %r14
+; AVX512BW-NEXT:    pushq %r13
+; AVX512BW-NEXT:    pushq %r12
+; AVX512BW-NEXT:    pushq %rbx
+; AVX512BW-NEXT:    kmovq (%rdi), %k0
+; AVX512BW-NEXT:    kshiftrq $31, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512BW-NEXT:    kshiftrq $30, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %ecx
+; AVX512BW-NEXT:    kshiftrq $29, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %edx
+; AVX512BW-NEXT:    kshiftrq $28, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %edi
+; AVX512BW-NEXT:    kshiftrq $27, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r8d
+; AVX512BW-NEXT:    kshiftrq $26, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r9d
+; AVX512BW-NEXT:    kshiftrq $63, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r11d
+; AVX512BW-NEXT:    kshiftrq $45, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %ebp
+; AVX512BW-NEXT:    kshiftrq $44, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r15d
+; AVX512BW-NEXT:    kshiftrq $10, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r14d
+; AVX512BW-NEXT:    kshiftrq $2, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r10d
+; AVX512BW-NEXT:    kshiftrq $1, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %ebx
+; AVX512BW-NEXT:    kshiftrq $3, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k0, %r12d
+; AVX512BW-NEXT:    movzbl %r12b, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    leaq (%r12,%r12,2), %r13
+; AVX512BW-NEXT:    leaq (%r13,%r12,4), %r12
+; AVX512BW-NEXT:    movzbl %bl, %ebx
+; AVX512BW-NEXT:    andl $1, %ebx
+; AVX512BW-NEXT:    leaq (%r12,%rbx,8), %r12
+; AVX512BW-NEXT:    movq %rbx, %r13
+; AVX512BW-NEXT:    shlq $4, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    shlq $5, %rbx
+; AVX512BW-NEXT:    orq %r13, %rbx
+; AVX512BW-NEXT:    movzbl %r10b, %r10d
+; AVX512BW-NEXT:    andl $1, %r10d
+; AVX512BW-NEXT:    movq %r10, %r12
+; AVX512BW-NEXT:    shlq $6, %r12
+; AVX512BW-NEXT:    movq %r10, %r13
+; AVX512BW-NEXT:    shlq $7, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrq $4, %k0, %k1
+; AVX512BW-NEXT:    shlq $8, %r10
+; AVX512BW-NEXT:    orq %r13, %r10
+; AVX512BW-NEXT:    movzbl %r12b, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $9, %r13
+; AVX512BW-NEXT:    orq %r10, %r13
+; AVX512BW-NEXT:    movq %r12, %r10
+; AVX512BW-NEXT:    shlq $10, %r10
+; AVX512BW-NEXT:    orq %r13, %r10
+; AVX512BW-NEXT:    kmovd %k1, %r13d
+; AVX512BW-NEXT:    kshiftrq $5, %k0, %k1
+; AVX512BW-NEXT:    shlq $11, %r12
+; AVX512BW-NEXT:    orq %r10, %r12
+; AVX512BW-NEXT:    movzbl %r13b, %r10d
+; AVX512BW-NEXT:    andl $1, %r10d
+; AVX512BW-NEXT:    movq %r10, %r13
+; AVX512BW-NEXT:    shlq $12, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    movq %r10, %r12
+; AVX512BW-NEXT:    shlq $13, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    kmovd %k1, %r13d
+; AVX512BW-NEXT:    kshiftrq $6, %k0, %k1
+; AVX512BW-NEXT:    shlq $14, %r10
+; AVX512BW-NEXT:    orq %r12, %r10
+; AVX512BW-NEXT:    movzbl %r13b, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $15, %r13
+; AVX512BW-NEXT:    orq %r10, %r13
+; AVX512BW-NEXT:    movq %r12, %r10
+; AVX512BW-NEXT:    shlq $16, %r10
+; AVX512BW-NEXT:    orq %r13, %r10
+; AVX512BW-NEXT:    kmovd %k1, %r13d
+; AVX512BW-NEXT:    kshiftrq $7, %k0, %k1
+; AVX512BW-NEXT:    shlq $17, %r12
+; AVX512BW-NEXT:    orq %r10, %r12
+; AVX512BW-NEXT:    movzbl %r13b, %r10d
+; AVX512BW-NEXT:    andl $1, %r10d
+; AVX512BW-NEXT:    movq %r10, %r13
+; AVX512BW-NEXT:    shlq $18, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    movq %r10, %r12
+; AVX512BW-NEXT:    shlq $19, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    kmovd %k1, %r13d
+; AVX512BW-NEXT:    kshiftrq $8, %k0, %k1
+; AVX512BW-NEXT:    shlq $20, %r10
+; AVX512BW-NEXT:    orq %r12, %r10
+; AVX512BW-NEXT:    movzbl %r13b, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $21, %r13
+; AVX512BW-NEXT:    orq %r10, %r13
+; AVX512BW-NEXT:    movq %r12, %r10
+; AVX512BW-NEXT:    shlq $22, %r10
+; AVX512BW-NEXT:    orq %r13, %r10
+; AVX512BW-NEXT:    kmovd %k1, %r13d
+; AVX512BW-NEXT:    kshiftrq $9, %k0, %k1
+; AVX512BW-NEXT:    shlq $23, %r12
+; AVX512BW-NEXT:    orq %r10, %r12
+; AVX512BW-NEXT:    movzbl %r13b, %r10d
+; AVX512BW-NEXT:    andl $1, %r10d
+; AVX512BW-NEXT:    movq %r10, %r13
+; AVX512BW-NEXT:    shlq $24, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    movq %r10, %r12
+; AVX512BW-NEXT:    shlq $25, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    kmovd %k1, %r13d
+; AVX512BW-NEXT:    kshiftrq $42, %k0, %k1
+; AVX512BW-NEXT:    shlq $26, %r10
+; AVX512BW-NEXT:    orq %r12, %r10
+; AVX512BW-NEXT:    movzbl %r13b, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $27, %r13
+; AVX512BW-NEXT:    orq %r10, %r13
+; AVX512BW-NEXT:    movq %r12, %r10
+; AVX512BW-NEXT:    shlq $28, %r10
+; AVX512BW-NEXT:    orq %r13, %r10
+; AVX512BW-NEXT:    shlq $29, %r12
+; AVX512BW-NEXT:    orq %r10, %r12
+; AVX512BW-NEXT:    movzbl %r14b, %r13d
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $30, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    kmovd %k1, %r10d
+; AVX512BW-NEXT:    kshiftrq $11, %k0, %k1
+; AVX512BW-NEXT:    shlq $31, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    kshiftrq $12, %k0, %k1
+; AVX512BW-NEXT:    andl $1, %r14d
+; AVX512BW-NEXT:    shlq $32, %r14
+; AVX512BW-NEXT:    orq %r13, %r14
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %r12
+; AVX512BW-NEXT:    shlq $33, %r12
+; AVX512BW-NEXT:    orq %r14, %r12
+; AVX512BW-NEXT:    movq %rax, %r14
+; AVX512BW-NEXT:    shlq $34, %r14
+; AVX512BW-NEXT:    orq %r12, %r14
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrq $13, %k0, %k1
+; AVX512BW-NEXT:    shlq $35, %rax
+; AVX512BW-NEXT:    orq %r14, %rax
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %r14
+; AVX512BW-NEXT:    shlq $36, %r14
+; AVX512BW-NEXT:    orq %rax, %r14
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $37, %rax
+; AVX512BW-NEXT:    orq %r14, %rax
+; AVX512BW-NEXT:    kmovd %k1, %r14d
+; AVX512BW-NEXT:    kshiftrq $14, %k0, %k1
+; AVX512BW-NEXT:    shlq $38, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    andl $1, %r14d
+; AVX512BW-NEXT:    movq %r14, %rax
+; AVX512BW-NEXT:    shlq $39, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r14, %r12
+; AVX512BW-NEXT:    shlq $40, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    kshiftrq $15, %k0, %k1
+; AVX512BW-NEXT:    shlq $41, %r14
+; AVX512BW-NEXT:    orq %r12, %r14
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %r12
+; AVX512BW-NEXT:    shlq $42, %r12
+; AVX512BW-NEXT:    orq %r14, %r12
+; AVX512BW-NEXT:    movq %rax, %r14
+; AVX512BW-NEXT:    shlq $43, %r14
+; AVX512BW-NEXT:    orq %r12, %r14
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrq $16, %k0, %k1
+; AVX512BW-NEXT:    shlq $44, %rax
+; AVX512BW-NEXT:    orq %r14, %rax
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %r14
+; AVX512BW-NEXT:    shlq $45, %r14
+; AVX512BW-NEXT:    orq %rax, %r14
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $46, %rax
+; AVX512BW-NEXT:    orq %r14, %rax
+; AVX512BW-NEXT:    kmovd %k1, %r14d
+; AVX512BW-NEXT:    kshiftrq $17, %k0, %k1
+; AVX512BW-NEXT:    shlq $47, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    andl $1, %r14d
+; AVX512BW-NEXT:    movq %r14, %rax
+; AVX512BW-NEXT:    shlq $48, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r14, %r12
+; AVX512BW-NEXT:    shlq $49, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    kshiftrq $18, %k0, %k1
+; AVX512BW-NEXT:    shlq $50, %r14
+; AVX512BW-NEXT:    orq %r12, %r14
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %r12
+; AVX512BW-NEXT:    shlq $51, %r12
+; AVX512BW-NEXT:    orq %r14, %r12
+; AVX512BW-NEXT:    movq %rax, %r14
+; AVX512BW-NEXT:    shlq $52, %r14
+; AVX512BW-NEXT:    orq %r12, %r14
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrq $19, %k0, %k1
+; AVX512BW-NEXT:    shlq $53, %rax
+; AVX512BW-NEXT:    orq %r14, %rax
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %r14
+; AVX512BW-NEXT:    shlq $54, %r14
+; AVX512BW-NEXT:    orq %rax, %r14
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $55, %rax
+; AVX512BW-NEXT:    orq %r14, %rax
+; AVX512BW-NEXT:    kmovd %k1, %r14d
+; AVX512BW-NEXT:    kshiftrq $20, %k0, %k1
+; AVX512BW-NEXT:    shlq $56, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    andl $1, %r14d
+; AVX512BW-NEXT:    movq %r14, %rax
+; AVX512BW-NEXT:    shlq $57, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r14, %r12
+; AVX512BW-NEXT:    shlq $58, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    kshiftrq $21, %k0, %k1
+; AVX512BW-NEXT:    shlq $59, %r14
+; AVX512BW-NEXT:    orq %r12, %r14
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %r12
+; AVX512BW-NEXT:    shlq $60, %r12
+; AVX512BW-NEXT:    orq %r14, %r12
+; AVX512BW-NEXT:    movq %rax, %r14
+; AVX512BW-NEXT:    shlq $61, %r14
+; AVX512BW-NEXT:    orq %r12, %r14
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrq $43, %k0, %k1
+; AVX512BW-NEXT:    shlq $62, %rax
+; AVX512BW-NEXT:    orq %r14, %rax
+; AVX512BW-NEXT:    movzbl %r12b, %r14d
+; AVX512BW-NEXT:    movq %r14, %r12
+; AVX512BW-NEXT:    shlq $63, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    kshiftrq $46, %k0, %k1
+; AVX512BW-NEXT:    orq %rbx, %r12
+; AVX512BW-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512BW-NEXT:    movq %r12, (%rsi)
+; AVX512BW-NEXT:    movzbl %al, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movzbl %r10b, %ebx
+; AVX512BW-NEXT:    movl %ebx, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    leaq (%r12,%rax,2), %r12
+; AVX512BW-NEXT:    leaq (%r12,%rax,4), %r12
+; AVX512BW-NEXT:    leaq (%r12,%rax,8), %rax
+; AVX512BW-NEXT:    movzbl %r15b, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $4, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r12, %r15
+; AVX512BW-NEXT:    shlq $5, %r15
+; AVX512BW-NEXT:    orq %r13, %r15
+; AVX512BW-NEXT:    shlq $6, %r12
+; AVX512BW-NEXT:    movzbl %bpl, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %r13
+; AVX512BW-NEXT:    shlq $7, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    movq %rax, %r12
+; AVX512BW-NEXT:    shlq $8, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    kmovd %k1, %ebp
+; AVX512BW-NEXT:    kshiftrq $47, %k0, %k1
+; AVX512BW-NEXT:    shlq $9, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movzbl %bpl, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $10, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $11, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    kmovd %k1, %ebp
+; AVX512BW-NEXT:    kshiftrq $48, %k0, %k1
+; AVX512BW-NEXT:    shlq $12, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movzbl %bpl, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %r13
+; AVX512BW-NEXT:    shlq $13, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    movq %rax, %r12
+; AVX512BW-NEXT:    shlq $14, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    kmovd %k1, %ebp
+; AVX512BW-NEXT:    kshiftrq $49, %k0, %k1
+; AVX512BW-NEXT:    shlq $15, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movzbl %bpl, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $16, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $17, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    kmovd %k1, %ebp
+; AVX512BW-NEXT:    kshiftrq $50, %k0, %k1
+; AVX512BW-NEXT:    shlq $18, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movzbl %bpl, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %r13
+; AVX512BW-NEXT:    shlq $19, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    movq %rax, %r12
+; AVX512BW-NEXT:    shlq $20, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    kmovd %k1, %ebp
+; AVX512BW-NEXT:    kshiftrq $51, %k0, %k1
+; AVX512BW-NEXT:    shlq $21, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movzbl %bpl, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $22, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $23, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    kmovd %k1, %ebp
+; AVX512BW-NEXT:    kshiftrq $52, %k0, %k1
+; AVX512BW-NEXT:    shlq $24, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movzbl %bpl, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %r13
+; AVX512BW-NEXT:    shlq $25, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    movq %rax, %r12
+; AVX512BW-NEXT:    shlq $26, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    kmovd %k1, %ebp
+; AVX512BW-NEXT:    kshiftrq $53, %k0, %k1
+; AVX512BW-NEXT:    shlq $27, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movzbl %bpl, %r13d
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $28, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $29, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrq $54, %k0, %k1
+; AVX512BW-NEXT:    shlq $30, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movzbl %r12b, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    shlq $31, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $32, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    kshiftrq $55, %k0, %k1
+; AVX512BW-NEXT:    shlq $33, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %r13
+; AVX512BW-NEXT:    shlq $34, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    movq %rax, %r12
+; AVX512BW-NEXT:    shlq $35, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    kmovd %k1, %r13d
+; AVX512BW-NEXT:    kshiftrq $56, %k0, %k1
+; AVX512BW-NEXT:    shlq $36, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $37, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $38, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrq $57, %k0, %k1
+; AVX512BW-NEXT:    shlq $39, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $40, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $41, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    kshiftrq $58, %k0, %k1
+; AVX512BW-NEXT:    shlq $42, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %r13
+; AVX512BW-NEXT:    shlq $43, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    movq %rax, %r12
+; AVX512BW-NEXT:    shlq $44, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    kmovd %k1, %r13d
+; AVX512BW-NEXT:    kshiftrq $59, %k0, %k1
+; AVX512BW-NEXT:    shlq $45, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $46, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $47, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrq $60, %k0, %k1
+; AVX512BW-NEXT:    shlq $48, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $49, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $50, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    kshiftrq $61, %k0, %k1
+; AVX512BW-NEXT:    shlq $51, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %r13
+; AVX512BW-NEXT:    shlq $52, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    movq %rax, %r12
+; AVX512BW-NEXT:    shlq $53, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    kmovd %k1, %r13d
+; AVX512BW-NEXT:    kshiftrq $62, %k0, %k1
+; AVX512BW-NEXT:    shlq $54, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $55, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $56, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrq $25, %k0, %k1
+; AVX512BW-NEXT:    shlq $57, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $58, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $59, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    shlq $60, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    movl %r11d, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %r13
+; AVX512BW-NEXT:    shlq $61, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    kmovd %k1, %ebp
+; AVX512BW-NEXT:    kshiftrq $24, %k0, %k1
+; AVX512BW-NEXT:    shlq $62, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrq $23, %k0, %k1
+; AVX512BW-NEXT:    movzbl %r11b, %r13d
+; AVX512BW-NEXT:    shlq $63, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    kmovd %k1, %r11d
+; AVX512BW-NEXT:    kshiftrq $22, %k0, %k1
+; AVX512BW-NEXT:    orq %r15, %r13
+; AVX512BW-NEXT:    kmovd %k1, %r15d
+; AVX512BW-NEXT:    kshiftrq $32, %k0, %k1
+; AVX512BW-NEXT:    movq %r13, 16(%rsi)
+; AVX512BW-NEXT:    kmovd %k1, %esi
+; AVX512BW-NEXT:    kshiftrq $33, %k0, %k1
+; AVX512BW-NEXT:    movzbl %r15b, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    andl $1, %r14d
+; AVX512BW-NEXT:    leaq (%r14,%r14,2), %r14
+; AVX512BW-NEXT:    leaq (%r14,%rax,4), %r14
+; AVX512BW-NEXT:    leaq (%r14,%rax,8), %r14
+; AVX512BW-NEXT:    shlq $4, %rax
+; AVX512BW-NEXT:    orq %r14, %rax
+; AVX512BW-NEXT:    movzbl %r11b, %r14d
+; AVX512BW-NEXT:    andl $1, %r14d
+; AVX512BW-NEXT:    movq %r14, %r11
+; AVX512BW-NEXT:    shlq $5, %r11
+; AVX512BW-NEXT:    orq %rax, %r11
+; AVX512BW-NEXT:    movq %r14, %rax
+; AVX512BW-NEXT:    shlq $6, %rax
+; AVX512BW-NEXT:    shlq $7, %r14
+; AVX512BW-NEXT:    orq %rax, %r14
+; AVX512BW-NEXT:    movzbl %r12b, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %r15
+; AVX512BW-NEXT:    shlq $8, %r15
+; AVX512BW-NEXT:    orq %r14, %r15
+; AVX512BW-NEXT:    movq %rax, %r12
+; AVX512BW-NEXT:    shlq $9, %r12
+; AVX512BW-NEXT:    orq %r15, %r12
+; AVX512BW-NEXT:    kmovd %k1, %r14d
+; AVX512BW-NEXT:    kshiftrq $34, %k0, %k1
+; AVX512BW-NEXT:    shlq $10, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movzbl %bpl, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %r15
+; AVX512BW-NEXT:    shlq $11, %r15
+; AVX512BW-NEXT:    orq %rax, %r15
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $12, %rax
+; AVX512BW-NEXT:    orq %r15, %rax
+; AVX512BW-NEXT:    kmovd %k1, %r15d
+; AVX512BW-NEXT:    kshiftrq $35, %k0, %k1
+; AVX512BW-NEXT:    shlq $13, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movzbl %r9b, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %r9
+; AVX512BW-NEXT:    shlq $14, %r9
+; AVX512BW-NEXT:    orq %r12, %r9
+; AVX512BW-NEXT:    movq %rax, %r12
+; AVX512BW-NEXT:    shlq $15, %r12
+; AVX512BW-NEXT:    orq %r9, %r12
+; AVX512BW-NEXT:    kmovd %k1, %r9d
+; AVX512BW-NEXT:    kshiftrq $36, %k0, %k1
+; AVX512BW-NEXT:    shlq $16, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movzbl %r8b, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %r8
+; AVX512BW-NEXT:    shlq $17, %r8
+; AVX512BW-NEXT:    orq %rax, %r8
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $18, %rax
+; AVX512BW-NEXT:    orq %r8, %rax
+; AVX512BW-NEXT:    kmovd %k1, %r8d
+; AVX512BW-NEXT:    kshiftrq $37, %k0, %k1
+; AVX512BW-NEXT:    shlq $19, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movzbl %dil, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %rdi
+; AVX512BW-NEXT:    shlq $20, %rdi
+; AVX512BW-NEXT:    orq %r12, %rdi
+; AVX512BW-NEXT:    movq %rax, %r12
+; AVX512BW-NEXT:    shlq $21, %r12
+; AVX512BW-NEXT:    orq %rdi, %r12
+; AVX512BW-NEXT:    kmovd %k1, %edi
+; AVX512BW-NEXT:    kshiftrq $38, %k0, %k1
+; AVX512BW-NEXT:    shlq $22, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movzbl %dl, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rdx
+; AVX512BW-NEXT:    shlq $23, %rdx
+; AVX512BW-NEXT:    orq %rax, %rdx
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $24, %rax
+; AVX512BW-NEXT:    orq %rdx, %rax
+; AVX512BW-NEXT:    kmovd %k1, %edx
+; AVX512BW-NEXT:    kshiftrq $39, %k0, %k1
+; AVX512BW-NEXT:    shlq $25, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movzbl %cl, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %rcx
+; AVX512BW-NEXT:    shlq $26, %rcx
+; AVX512BW-NEXT:    orq %r12, %rcx
+; AVX512BW-NEXT:    movq %rax, %r12
+; AVX512BW-NEXT:    shlq $27, %r12
+; AVX512BW-NEXT:    orq %rcx, %r12
+; AVX512BW-NEXT:    kmovd %k1, %ecx
+; AVX512BW-NEXT:    kshiftrq $40, %k0, %k1
+; AVX512BW-NEXT:    shlq $28, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rbp
+; AVX512BW-NEXT:    shlq $29, %rbp
+; AVX512BW-NEXT:    orq %rax, %rbp
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $30, %rax
+; AVX512BW-NEXT:    orq %rbp, %rax
+; AVX512BW-NEXT:    kmovd %k1, %ebp
+; AVX512BW-NEXT:    kshiftrq $41, %k0, %k0
+; AVX512BW-NEXT:    shlq $31, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    andl $1, %esi
+; AVX512BW-NEXT:    movq %rsi, %rax
+; AVX512BW-NEXT:    shlq $32, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %rsi, %r13
+; AVX512BW-NEXT:    shlq $33, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    kmovd %k0, %r12d
+; AVX512BW-NEXT:    shlq $34, %rsi
+; AVX512BW-NEXT:    orq %r13, %rsi
+; AVX512BW-NEXT:    andl $1, %r14d
+; AVX512BW-NEXT:    movq %r14, %rax
+; AVX512BW-NEXT:    shlq $35, %rax
+; AVX512BW-NEXT:    orq %rsi, %rax
+; AVX512BW-NEXT:    movq %r14, %rsi
+; AVX512BW-NEXT:    shlq $36, %rsi
+; AVX512BW-NEXT:    orq %rax, %rsi
+; AVX512BW-NEXT:    shlq $37, %r14
+; AVX512BW-NEXT:    orq %rsi, %r14
+; AVX512BW-NEXT:    andl $1, %r15d
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $38, %rax
+; AVX512BW-NEXT:    orq %r14, %rax
+; AVX512BW-NEXT:    movq %r15, %rsi
+; AVX512BW-NEXT:    shlq $39, %rsi
+; AVX512BW-NEXT:    orq %rax, %rsi
+; AVX512BW-NEXT:    shlq $40, %r15
+; AVX512BW-NEXT:    orq %rsi, %r15
+; AVX512BW-NEXT:    andl $1, %r9d
+; AVX512BW-NEXT:    movq %r9, %rax
+; AVX512BW-NEXT:    shlq $41, %rax
+; AVX512BW-NEXT:    orq %r15, %rax
+; AVX512BW-NEXT:    movq %r9, %rsi
+; AVX512BW-NEXT:    shlq $42, %rsi
+; AVX512BW-NEXT:    orq %rax, %rsi
+; AVX512BW-NEXT:    shlq $43, %r9
+; AVX512BW-NEXT:    orq %rsi, %r9
+; AVX512BW-NEXT:    andl $1, %r8d
+; AVX512BW-NEXT:    movq %r8, %rax
+; AVX512BW-NEXT:    shlq $44, %rax
+; AVX512BW-NEXT:    orq %r9, %rax
+; AVX512BW-NEXT:    movq %r8, %rsi
+; AVX512BW-NEXT:    shlq $45, %rsi
+; AVX512BW-NEXT:    orq %rax, %rsi
+; AVX512BW-NEXT:    shlq $46, %r8
+; AVX512BW-NEXT:    orq %rsi, %r8
+; AVX512BW-NEXT:    andl $1, %edi
+; AVX512BW-NEXT:    movq %rdi, %rax
+; AVX512BW-NEXT:    shlq $47, %rax
+; AVX512BW-NEXT:    orq %r8, %rax
+; AVX512BW-NEXT:    movq %rdi, %rsi
+; AVX512BW-NEXT:    shlq $48, %rsi
+; AVX512BW-NEXT:    orq %rax, %rsi
+; AVX512BW-NEXT:    shlq $49, %rdi
+; AVX512BW-NEXT:    orq %rsi, %rdi
+; AVX512BW-NEXT:    andl $1, %edx
+; AVX512BW-NEXT:    movq %rdx, %rax
+; AVX512BW-NEXT:    shlq $50, %rax
+; AVX512BW-NEXT:    orq %rdi, %rax
+; AVX512BW-NEXT:    movq %rdx, %rsi
+; AVX512BW-NEXT:    shlq $51, %rsi
+; AVX512BW-NEXT:    orq %rax, %rsi
+; AVX512BW-NEXT:    shlq $52, %rdx
+; AVX512BW-NEXT:    orq %rsi, %rdx
+; AVX512BW-NEXT:    andl $1, %ecx
+; AVX512BW-NEXT:    movq %rcx, %rax
+; AVX512BW-NEXT:    shlq $53, %rax
+; AVX512BW-NEXT:    orq %rdx, %rax
+; AVX512BW-NEXT:    movq %rcx, %rdx
+; AVX512BW-NEXT:    shlq $54, %rdx
+; AVX512BW-NEXT:    orq %rax, %rdx
+; AVX512BW-NEXT:    shlq $55, %rcx
+; AVX512BW-NEXT:    orq %rdx, %rcx
+; AVX512BW-NEXT:    andl $1, %ebp
+; AVX512BW-NEXT:    movq %rbp, %rax
+; AVX512BW-NEXT:    shlq $56, %rax
+; AVX512BW-NEXT:    orq %rcx, %rax
+; AVX512BW-NEXT:    movq %rbp, %rcx
+; AVX512BW-NEXT:    shlq $57, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    shlq $58, %rbp
+; AVX512BW-NEXT:    orq %rcx, %rbp
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $59, %rax
+; AVX512BW-NEXT:    orq %rbp, %rax
+; AVX512BW-NEXT:    movq %r12, %rcx
+; AVX512BW-NEXT:    shlq $60, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    shlq $61, %r12
+; AVX512BW-NEXT:    orq %rcx, %r12
+; AVX512BW-NEXT:    andl $1, %r10d
+; AVX512BW-NEXT:    shlq $62, %r10
+; AVX512BW-NEXT:    orq %r12, %r10
+; AVX512BW-NEXT:    shlq $63, %rbx
+; AVX512BW-NEXT:    orq %r10, %rbx
+; AVX512BW-NEXT:    orq %r11, %rbx
+; AVX512BW-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512BW-NEXT:    movq %rbx, 8(%rax)
+; AVX512BW-NEXT:    popq %rbx
+; AVX512BW-NEXT:    popq %r12
+; AVX512BW-NEXT:    popq %r13
+; AVX512BW-NEXT:    popq %r14
+; AVX512BW-NEXT:    popq %r15
+; AVX512BW-NEXT:    popq %rbp
+; AVX512BW-NEXT:    retq
+  %src.vec = load <64 x i1>, ptr %in.vec, align 64
+  %tgt.mask = shufflevector <64 x i1> %src.vec, <64 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
+  store <192 x i1> %tgt.mask, ptr %out.vec, align 64
+  ret void
+}
+
+; Replication factor 4, VF=2: load a <2 x i1> mask from %in.vec, repeat each
+; element four times in order and store the resulting <8 x i1> mask to
+; %out.vec.
+; The autogenerated assertions below pin the lowering per RUN configuration:
+; the -SLOW prefixes expect a vpshufd + vpermq pair, while the -FAST prefixes
+; expect a single vpermd driven by a constant index vector. The AVX512DQ
+; variants expect kmovb/vpmovm2d/vpmovd2m and a direct kmovb store, whereas
+; the other variants expect the result byte to be stored through %al.
+define void @mask_replication_factor4_vf2(ptr %in.vec, ptr %out.vec) nounwind {
+; AVX512F-SLOW-LABEL: mask_replication_factor4_vf2:
+; AVX512F-SLOW:       # %bb.0:
+; AVX512F-SLOW-NEXT:    kmovw (%rdi), %k1
+; AVX512F-SLOW-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512F-SLOW-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512F-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX512F-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
+; AVX512F-SLOW-NEXT:    vptestmd %ymm0, %ymm0, %k0
+; AVX512F-SLOW-NEXT:    kmovw %k0, %eax
+; AVX512F-SLOW-NEXT:    movb %al, (%rsi)
+; AVX512F-SLOW-NEXT:    vzeroupper
+; AVX512F-SLOW-NEXT:    retq
+;
+; AVX512F-FAST-LABEL: mask_replication_factor4_vf2:
+; AVX512F-FAST:       # %bb.0:
+; AVX512F-FAST-NEXT:    kmovw (%rdi), %k1
+; AVX512F-FAST-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512F-FAST-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1]
+; AVX512F-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX512F-FAST-NEXT:    vptestmd %ymm0, %ymm0, %k0
+; AVX512F-FAST-NEXT:    kmovw %k0, %eax
+; AVX512F-FAST-NEXT:    movb %al, (%rsi)
+; AVX512F-FAST-NEXT:    vzeroupper
+; AVX512F-FAST-NEXT:    retq
+;
+; AVX512DQ-SLOW-LABEL: mask_replication_factor4_vf2:
+; AVX512DQ-SLOW:       # %bb.0:
+; AVX512DQ-SLOW-NEXT:    kmovb (%rdi), %k0
+; AVX512DQ-SLOW-NEXT:    vpmovm2d %k0, %ymm0
+; AVX512DQ-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
+; AVX512DQ-SLOW-NEXT:    vpmovd2m %ymm0, %k0
+; AVX512DQ-SLOW-NEXT:    kmovb %k0, (%rsi)
+; AVX512DQ-SLOW-NEXT:    vzeroupper
+; AVX512DQ-SLOW-NEXT:    retq
+;
+; AVX512DQ-FAST-LABEL: mask_replication_factor4_vf2:
+; AVX512DQ-FAST:       # %bb.0:
+; AVX512DQ-FAST-NEXT:    kmovb (%rdi), %k0
+; AVX512DQ-FAST-NEXT:    vpmovm2d %k0, %ymm0
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1]
+; AVX512DQ-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX512DQ-FAST-NEXT:    vpmovd2m %ymm0, %k0
+; AVX512DQ-FAST-NEXT:    kmovb %k0, (%rsi)
+; AVX512DQ-FAST-NEXT:    vzeroupper
+; AVX512DQ-FAST-NEXT:    retq
+;
+; AVX512BW-SLOW-LABEL: mask_replication_factor4_vf2:
+; AVX512BW-SLOW:       # %bb.0:
+; AVX512BW-SLOW-NEXT:    kmovw (%rdi), %k1
+; AVX512BW-SLOW-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512BW-SLOW-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512BW-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
+; AVX512BW-SLOW-NEXT:    vptestmd %ymm0, %ymm0, %k0
+; AVX512BW-SLOW-NEXT:    kmovd %k0, %eax
+; AVX512BW-SLOW-NEXT:    movb %al, (%rsi)
+; AVX512BW-SLOW-NEXT:    vzeroupper
+; AVX512BW-SLOW-NEXT:    retq
+;
+; AVX512BW-FAST-LABEL: mask_replication_factor4_vf2:
+; AVX512BW-FAST:       # %bb.0:
+; AVX512BW-FAST-NEXT:    kmovw (%rdi), %k1
+; AVX512BW-FAST-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512BW-FAST-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1]
+; AVX512BW-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX512BW-FAST-NEXT:    vptestmd %ymm0, %ymm0, %k0
+; AVX512BW-FAST-NEXT:    kmovd %k0, %eax
+; AVX512BW-FAST-NEXT:    movb %al, (%rsi)
+; AVX512BW-FAST-NEXT:    vzeroupper
+; AVX512BW-FAST-NEXT:    retq
+;
+; AVX512VBMI-SLOW-LABEL: mask_replication_factor4_vf2:
+; AVX512VBMI-SLOW:       # %bb.0:
+; AVX512VBMI-SLOW-NEXT:    kmovw (%rdi), %k1
+; AVX512VBMI-SLOW-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512VBMI-SLOW-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512VBMI-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX512VBMI-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
+; AVX512VBMI-SLOW-NEXT:    vptestmd %ymm0, %ymm0, %k0
+; AVX512VBMI-SLOW-NEXT:    kmovd %k0, %eax
+; AVX512VBMI-SLOW-NEXT:    movb %al, (%rsi)
+; AVX512VBMI-SLOW-NEXT:    vzeroupper
+; AVX512VBMI-SLOW-NEXT:    retq
+;
+; AVX512VBMI-FAST-LABEL: mask_replication_factor4_vf2:
+; AVX512VBMI-FAST:       # %bb.0:
+; AVX512VBMI-FAST-NEXT:    kmovw (%rdi), %k1
+; AVX512VBMI-FAST-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512VBMI-FAST-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512VBMI-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1]
+; AVX512VBMI-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX512VBMI-FAST-NEXT:    vptestmd %ymm0, %ymm0, %k0
+; AVX512VBMI-FAST-NEXT:    kmovd %k0, %eax
+; AVX512VBMI-FAST-NEXT:    movb %al, (%rsi)
+; AVX512VBMI-FAST-NEXT:    vzeroupper
+; AVX512VBMI-FAST-NEXT:    retq
+; IR under test. The shuffle mask lists each source index four times in a
+; row, so output lane 4*i+j (j in 0..3) takes input lane i.
+  %src.vec = load <2 x i1>, ptr %in.vec, align 64
+  %tgt.mask = shufflevector <2 x i1> %src.vec, <2 x i1> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
+  store <8 x i1> %tgt.mask, ptr %out.vec, align 64
+  ret void
+}
+
+; Replication factor 4, VF=4: load a <4 x i1> mask from %in.vec, repeat each
+; element four times in order and store the resulting <16 x i1> mask to
+; %out.vec.
+; Every prefix group below expects one zmm-wide vpermd with a 16-entry
+; constant index vector and a single kmovw store. The AVX512DQ group differs
+; only in how the mask is widened and narrowed (kmovb + vpmovm2d / vpmovd2m
+; versus kmovw + vpternlogd / vptestmd).
+define void @mask_replication_factor4_vf4(ptr %in.vec, ptr %out.vec) nounwind {
+; AVX512F-ONLY-LABEL: mask_replication_factor4_vf4:
+; AVX512F-ONLY:       # %bb.0:
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, (%rsi)
+; AVX512F-ONLY-NEXT:    vzeroupper
+; AVX512F-ONLY-NEXT:    retq
+;
+; AVX512DQ-LABEL: mask_replication_factor4_vf4:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    kmovb (%rdi), %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, (%rsi)
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: mask_replication_factor4_vf4:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    kmovw (%rdi), %k1
+; AVX512BW-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
+; AVX512BW-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; AVX512BW-NEXT:    kmovw %k0, (%rsi)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+; IR under test. The shuffle mask lists each source index four times in a
+; row, so output lane 4*i+j (j in 0..3) takes input lane i.
+  %src.vec = load <4 x i1>, ptr %in.vec, align 64
+  %tgt.mask = shufflevector <4 x i1> %src.vec, <4 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
+  store <16 x i1> %tgt.mask, ptr %out.vec, align 64
+  ret void
+}
+
+; Replication factor 4, VF=8: load an <8 x i1> mask from %in.vec, repeat each
+; element four times in order and store the resulting <32 x i1> mask to
+; %out.vec.
+; The AVX512F-ONLY and AVX512DQ groups expect the 32 result bits to be built
+; as two 16-lane vpermd results (source indices 0..3 and 4..7) stored as two
+; words at offsets 0 and 2. The AVX512BW group expects a single 32-lane
+; vpermw and one kmovd store.
+define void @mask_replication_factor4_vf8(ptr %in.vec, ptr %out.vec) nounwind {
+; AVX512F-ONLY-LABEL: mask_replication_factor4_vf8:
+; AVX512F-ONLY:       # %bb.0:
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; AVX512F-ONLY-NEXT:    kmovw %k1, 2(%rsi)
+; AVX512F-ONLY-NEXT:    kmovw %k0, (%rsi)
+; AVX512F-ONLY-NEXT:    vzeroupper
+; AVX512F-ONLY-NEXT:    retq
+;
+; AVX512DQ-LABEL: mask_replication_factor4_vf8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    kmovb (%rdi), %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k0
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k1
+; AVX512DQ-NEXT:    kmovw %k1, 2(%rsi)
+; AVX512DQ-NEXT:    kmovw %k0, (%rsi)
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: mask_replication_factor4_vf8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    kmovw (%rdi), %k0
+; AVX512BW-NEXT:    vpmovm2w %k0, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
+; AVX512BW-NEXT:    vpermw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vpmovw2m %zmm0, %k0
+; AVX512BW-NEXT:    kmovd %k0, (%rsi)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+; IR under test. The shuffle mask lists each source index four times in a
+; row, so output lane 4*i+j (j in 0..3) takes input lane i.
+  %src.vec = load <8 x i1>, ptr %in.vec, align 64
+  %tgt.mask = shufflevector <8 x i1> %src.vec, <8 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
+  store <32 x i1> %tgt.mask, ptr %out.vec, align 64
+  ret void
+}
+
+; Replication factor 4, VF=16: load a <16 x i1> mask from %in.vec, repeat
+; each element four times in order and store the resulting <64 x i1> mask to
+; %out.vec.
+; The AVX512F-ONLY and AVX512DQ groups expect four 16-lane vpermd results
+; (source indices 0..3, 4..7, 8..11, 12..15) stored as four words at offsets
+; 0, 2, 4 and 6. The AVX512BW-ONLY group expects a byte-lane lowering with
+; vshufi64x2 followed by vpshufb and one kmovq store. The AVX512VBMI-ONLY
+; group expects a single vpermb instead (its RUN line is outside this
+; excerpt; presumably it enables +avx512vbmi, TODO confirm).
+define void @mask_replication_factor4_vf16(ptr %in.vec, ptr %out.vec) nounwind {
+; AVX512F-ONLY-LABEL: mask_replication_factor4_vf16:
+; AVX512F-ONLY:       # %bb.0:
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k2
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k3
+; AVX512F-ONLY-NEXT:    kmovw %k3, 4(%rsi)
+; AVX512F-ONLY-NEXT:    kmovw %k2, 6(%rsi)
+; AVX512F-ONLY-NEXT:    kmovw %k1, 2(%rsi)
+; AVX512F-ONLY-NEXT:    kmovw %k0, (%rsi)
+; AVX512F-ONLY-NEXT:    vzeroupper
+; AVX512F-ONLY-NEXT:    retq
+;
+; AVX512DQ-LABEL: mask_replication_factor4_vf16:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    kmovw (%rdi), %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k0
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k1
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k2
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k3
+; AVX512DQ-NEXT:    kmovw %k3, 4(%rsi)
+; AVX512DQ-NEXT:    kmovw %k2, 6(%rsi)
+; AVX512DQ-NEXT:    kmovw %k1, 2(%rsi)
+; AVX512DQ-NEXT:    kmovw %k0, (%rsi)
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-ONLY-LABEL: mask_replication_factor4_vf16:
+; AVX512BW-ONLY:       # %bb.0:
+; AVX512BW-ONLY-NEXT:    kmovw (%rdi), %k0
+; AVX512BW-ONLY-NEXT:    vpmovm2b %k0, %zmm0
+; AVX512BW-ONLY-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
+; AVX512BW-ONLY-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,20,20,20,20,21,21,21,21,22,22,22,22,23,23,23,23,40,40,40,40,41,41,41,41,42,42,42,42,43,43,43,43,60,60,60,60,61,61,61,61,62,62,62,62,63,63,63,63]
+; AVX512BW-ONLY-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512BW-ONLY-NEXT:    kmovq %k0, (%rsi)
+; AVX512BW-ONLY-NEXT:    vzeroupper
+; AVX512BW-ONLY-NEXT:    retq
+;
+; AVX512VBMI-ONLY-LABEL: mask_replication_factor4_vf16:
+; AVX512VBMI-ONLY:       # %bb.0:
+; AVX512VBMI-ONLY-NEXT:    kmovw (%rdi), %k0
+; AVX512VBMI-ONLY-NEXT:    vpmovm2b %k0, %zmm0
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
+; AVX512VBMI-ONLY-NEXT:    vpermb %zmm0, %zmm1, %zmm0
+; AVX512VBMI-ONLY-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512VBMI-ONLY-NEXT:    kmovq %k0, (%rsi)
+; AVX512VBMI-ONLY-NEXT:    vzeroupper
+; AVX512VBMI-ONLY-NEXT:    retq
+; IR under test. The shuffle mask lists each source index four times in a
+; row, so output lane 4*i+j (j in 0..3) takes input lane i.
+  %src.vec = load <16 x i1>, ptr %in.vec, align 64
+  %tgt.mask = shufflevector <16 x i1> %src.vec, <16 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
+  store <64 x i1> %tgt.mask, ptr %out.vec, align 64
+  ret void
+}
+
+; Replication factor 4, VF=32: load a <32 x i1> mask from %in.vec, repeat
+; each element four times in order and store the resulting <128 x i1> mask
+; to %out.vec.
+; The AVX512F-ONLY and AVX512DQ groups expect the input to be read as two
+; 16-bit mask words (offsets 0 and 2), each expanded with the same four
+; vpermd index vectors, giving eight word stores at offsets 0..14. The
+; AVX512BW-ONLY group expects two vshufi64x2 + vpshufb pairs sharing one
+; byte-index constant and two kmovq stores. The AVX512VBMI-ONLY group expects
+; two vpermb instructions with distinct index constants (its RUN line is
+; outside this excerpt; presumably it enables +avx512vbmi, TODO confirm).
+define void @mask_replication_factor4_vf32(ptr %in.vec, ptr %out.vec) nounwind {
+; AVX512F-ONLY-LABEL: mask_replication_factor4_vf32:
+; AVX512F-ONLY:       # %bb.0:
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
+; AVX512F-ONLY-NEXT:    kmovw 2(%rdi), %k3
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm2
+; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k0
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm2, %zmm3
+; AVX512F-ONLY-NEXT:    vptestmd %zmm3, %zmm3, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm3, %zmm4
+; AVX512F-ONLY-NEXT:    vptestmd %zmm4, %zmm4, %k2
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm4, %zmm0
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k4
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k3} {z}
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k3
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm2, %zmm1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k5
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm3, %zmm1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k6
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm4, %zmm0
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k7
+; AVX512F-ONLY-NEXT:    kmovw %k7, 12(%rsi)
+; AVX512F-ONLY-NEXT:    kmovw %k6, 14(%rsi)
+; AVX512F-ONLY-NEXT:    kmovw %k5, 8(%rsi)
+; AVX512F-ONLY-NEXT:    kmovw %k3, 10(%rsi)
+; AVX512F-ONLY-NEXT:    kmovw %k4, 4(%rsi)
+; AVX512F-ONLY-NEXT:    kmovw %k2, 6(%rsi)
+; AVX512F-ONLY-NEXT:    kmovw %k1, (%rsi)
+; AVX512F-ONLY-NEXT:    kmovw %k0, 2(%rsi)
+; AVX512F-ONLY-NEXT:    vzeroupper
+; AVX512F-ONLY-NEXT:    retq
+;
+; AVX512DQ-LABEL: mask_replication_factor4_vf32:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    kmovw (%rdi), %k0
+; AVX512DQ-NEXT:    kmovw 2(%rdi), %k3
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm2
+; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm2, %zmm3
+; AVX512DQ-NEXT:    vpmovd2m %zmm3, %k1
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm3, %zmm4
+; AVX512DQ-NEXT:    vpmovd2m %zmm4, %k2
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm4, %zmm0
+; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k4
+; AVX512DQ-NEXT:    vpmovm2d %k3, %zmm0
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k3
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm2, %zmm1
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k5
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm3, %zmm1
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k6
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm4, %zmm0
+; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k7
+; AVX512DQ-NEXT:    kmovw %k7, 12(%rsi)
+; AVX512DQ-NEXT:    kmovw %k6, 14(%rsi)
+; AVX512DQ-NEXT:    kmovw %k5, 8(%rsi)
+; AVX512DQ-NEXT:    kmovw %k3, 10(%rsi)
+; AVX512DQ-NEXT:    kmovw %k4, 4(%rsi)
+; AVX512DQ-NEXT:    kmovw %k2, 6(%rsi)
+; AVX512DQ-NEXT:    kmovw %k1, (%rsi)
+; AVX512DQ-NEXT:    kmovw %k0, 2(%rsi)
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-ONLY-LABEL: mask_replication_factor4_vf32:
+; AVX512BW-ONLY:       # %bb.0:
+; AVX512BW-ONLY-NEXT:    kmovd (%rdi), %k0
+; AVX512BW-ONLY-NEXT:    vpmovm2b %k0, %zmm0
+; AVX512BW-ONLY-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,0,1,0,1,0,1]
+; AVX512BW-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
+; AVX512BW-ONLY-NEXT:    vpshufb %zmm2, %zmm1, %zmm1
+; AVX512BW-ONLY-NEXT:    vpmovb2m %zmm1, %k0
+; AVX512BW-ONLY-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3,2,3,2,3]
+; AVX512BW-ONLY-NEXT:    vpshufb %zmm2, %zmm0, %zmm0
+; AVX512BW-ONLY-NEXT:    vpmovb2m %zmm0, %k1
+; AVX512BW-ONLY-NEXT:    kmovq %k1, 8(%rsi)
+; AVX512BW-ONLY-NEXT:    kmovq %k0, (%rsi)
+; AVX512BW-ONLY-NEXT:    vzeroupper
+; AVX512BW-ONLY-NEXT:    retq
+;
+; AVX512VBMI-ONLY-LABEL: mask_replication_factor4_vf32:
+; AVX512VBMI-ONLY:       # %bb.0:
+; AVX512VBMI-ONLY-NEXT:    kmovd (%rdi), %k0
+; AVX512VBMI-ONLY-NEXT:    vpmovm2b %k0, %zmm0
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
+; AVX512VBMI-ONLY-NEXT:    vpermb %zmm0, %zmm1, %zmm1
+; AVX512VBMI-ONLY-NEXT:    vpmovb2m %zmm1, %k0
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [16,16,16,16,17,17,17,17,18,18,18,18,19,19,19,19,20,20,20,20,21,21,21,21,22,22,22,22,23,23,23,23,24,24,24,24,25,25,25,25,26,26,26,26,27,27,27,27,28,28,28,28,29,29,29,29,30,30,30,30,31,31,31,31]
+; AVX512VBMI-ONLY-NEXT:    vpermb %zmm0, %zmm1, %zmm0
+; AVX512VBMI-ONLY-NEXT:    vpmovb2m %zmm0, %k1
+; AVX512VBMI-ONLY-NEXT:    kmovq %k1, 8(%rsi)
+; AVX512VBMI-ONLY-NEXT:    kmovq %k0, (%rsi)
+; AVX512VBMI-ONLY-NEXT:    vzeroupper
+; AVX512VBMI-ONLY-NEXT:    retq
+; IR under test. The shuffle mask lists each source index four times in a
+; row, so output lane 4*i+j (j in 0..3) takes input lane i.
+  %src.vec = load <32 x i1>, ptr %in.vec, align 64
+  %tgt.mask = shufflevector <32 x i1> %src.vec, <32 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
+  store <128 x i1> %tgt.mask, ptr %out.vec, align 64
+  ret void
+}
+
+define void @mask_replication_factor4_vf64(ptr %in.vec, ptr %out.vec) nounwind {
+; AVX512F-ONLY-LABEL: mask_replication_factor4_vf64:
+; AVX512F-ONLY:       # %bb.0:
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    kmovw 2(%rdi), %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; AVX512F-ONLY-NEXT:    kmovw 4(%rdi), %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; AVX512F-ONLY-NEXT:    kmovw 6(%rdi), %k1
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm3, %zmm4
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm5, %zmm6
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm7, %zmm8
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm9, %zmm0
+; AVX512F-ONLY-NEXT:    vpermd %zmm1, %zmm3, %zmm10
+; AVX512F-ONLY-NEXT:    vpermd %zmm1, %zmm5, %zmm11
+; AVX512F-ONLY-NEXT:    vpermd %zmm1, %zmm7, %zmm12
+; AVX512F-ONLY-NEXT:    vpermd %zmm1, %zmm9, %zmm1
+; AVX512F-ONLY-NEXT:    vpermd %zmm2, %zmm3, %zmm13
+; AVX512F-ONLY-NEXT:    vpermd %zmm2, %zmm5, %zmm14
+; AVX512F-ONLY-NEXT:    vpermd %zmm2, %zmm7, %zmm15
+; AVX512F-ONLY-NEXT:    vpermd %zmm2, %zmm9, %zmm2
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm16, %zmm16, %zmm16 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vpermd %zmm16, %zmm3, %zmm3
+; AVX512F-ONLY-NEXT:    vpermd %zmm16, %zmm5, %zmm5
+; AVX512F-ONLY-NEXT:    vpermd %zmm16, %zmm7, %zmm7
+; AVX512F-ONLY-NEXT:    vpermd %zmm16, %zmm9, %zmm9
+; AVX512F-ONLY-NEXT:    vptestmd %zmm9, %zmm9, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 28(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm7, %zmm7, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 30(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm5, %zmm5, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 24(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm3, %zmm3, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 26(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 20(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm15, %zmm15, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 22(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm14, %zmm14, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 16(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm13, %zmm13, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 18(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 12(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm12, %zmm12, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 14(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm11, %zmm11, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 8(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm10, %zmm10, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 10(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 4(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm8, %zmm8, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 6(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm6, %zmm6, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, (%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm4, %zmm4, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 2(%rsi)
+; AVX512F-ONLY-NEXT:    vzeroupper
+; AVX512F-ONLY-NEXT:    retq
+;
+; AVX512DQ-LABEL: mask_replication_factor4_vf64:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    kmovw (%rdi), %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT:    kmovw 2(%rdi), %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm1
+; AVX512DQ-NEXT:    kmovw 4(%rdi), %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm2
+; AVX512DQ-NEXT:    kmovw 6(%rdi), %k0
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm3, %zmm4
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm5, %zmm6
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm7, %zmm8
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm9, %zmm0
+; AVX512DQ-NEXT:    vpermd %zmm1, %zmm3, %zmm10
+; AVX512DQ-NEXT:    vpermd %zmm1, %zmm5, %zmm11
+; AVX512DQ-NEXT:    vpermd %zmm1, %zmm7, %zmm12
+; AVX512DQ-NEXT:    vpermd %zmm1, %zmm9, %zmm1
+; AVX512DQ-NEXT:    vpermd %zmm2, %zmm3, %zmm13
+; AVX512DQ-NEXT:    vpermd %zmm2, %zmm5, %zmm14
+; AVX512DQ-NEXT:    vpermd %zmm2, %zmm7, %zmm15
+; AVX512DQ-NEXT:    vpermd %zmm2, %zmm9, %zmm2
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm16
+; AVX512DQ-NEXT:    vpermd %zmm16, %zmm3, %zmm3
+; AVX512DQ-NEXT:    vpermd %zmm16, %zmm5, %zmm5
+; AVX512DQ-NEXT:    vpermd %zmm16, %zmm7, %zmm7
+; AVX512DQ-NEXT:    vpermd %zmm16, %zmm9, %zmm9
+; AVX512DQ-NEXT:    vpmovd2m %zmm9, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 28(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm7, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 30(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm5, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 24(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm3, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 26(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 20(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm15, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 22(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm14, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 16(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm13, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 18(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 12(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm12, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 14(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm11, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 8(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm10, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 10(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 4(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm8, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 6(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm6, %k0
+; AVX512DQ-NEXT:    kmovw %k0, (%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm4, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 2(%rsi)
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-ONLY-LABEL: mask_replication_factor4_vf64:
+; AVX512BW-ONLY:       # %bb.0:
+; AVX512BW-ONLY-NEXT:    kmovq (%rdi), %k0
+; AVX512BW-ONLY-NEXT:    vpmovm2b %k0, %zmm0
+; AVX512BW-ONLY-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,0,1,0,1,0,1]
+; AVX512BW-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
+; AVX512BW-ONLY-NEXT:    vpshufb %zmm2, %zmm1, %zmm1
+; AVX512BW-ONLY-NEXT:    vpmovb2m %zmm1, %k0
+; AVX512BW-ONLY-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,2,3,2,3]
+; AVX512BW-ONLY-NEXT:    vpshufb %zmm2, %zmm1, %zmm1
+; AVX512BW-ONLY-NEXT:    vpmovb2m %zmm1, %k1
+; AVX512BW-ONLY-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[6,7,6,7,6,7,6,7]
+; AVX512BW-ONLY-NEXT:    vpshufb %zmm2, %zmm1, %zmm1
+; AVX512BW-ONLY-NEXT:    vpmovb2m %zmm1, %k2
+; AVX512BW-ONLY-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,4,5,4,5,4,5]
+; AVX512BW-ONLY-NEXT:    vpshufb %zmm2, %zmm0, %zmm0
+; AVX512BW-ONLY-NEXT:    vpmovb2m %zmm0, %k3
+; AVX512BW-ONLY-NEXT:    kmovq %k3, 16(%rsi)
+; AVX512BW-ONLY-NEXT:    kmovq %k2, 24(%rsi)
+; AVX512BW-ONLY-NEXT:    kmovq %k1, 8(%rsi)
+; AVX512BW-ONLY-NEXT:    kmovq %k0, (%rsi)
+; AVX512BW-ONLY-NEXT:    vzeroupper
+; AVX512BW-ONLY-NEXT:    retq
+;
+; AVX512VBMI-ONLY-LABEL: mask_replication_factor4_vf64:
+; AVX512VBMI-ONLY:       # %bb.0:
+; AVX512VBMI-ONLY-NEXT:    kmovq (%rdi), %k0
+; AVX512VBMI-ONLY-NEXT:    vpmovm2b %k0, %zmm0
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
+; AVX512VBMI-ONLY-NEXT:    vpermb %zmm0, %zmm1, %zmm1
+; AVX512VBMI-ONLY-NEXT:    vpmovb2m %zmm1, %k0
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [16,16,16,16,17,17,17,17,18,18,18,18,19,19,19,19,20,20,20,20,21,21,21,21,22,22,22,22,23,23,23,23,24,24,24,24,25,25,25,25,26,26,26,26,27,27,27,27,28,28,28,28,29,29,29,29,30,30,30,30,31,31,31,31]
+; AVX512VBMI-ONLY-NEXT:    vpermb %zmm0, %zmm1, %zmm1
+; AVX512VBMI-ONLY-NEXT:    vpmovb2m %zmm1, %k1
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [48,48,48,48,49,49,49,49,50,50,50,50,51,51,51,51,52,52,52,52,53,53,53,53,54,54,54,54,55,55,55,55,56,56,56,56,57,57,57,57,58,58,58,58,59,59,59,59,60,60,60,60,61,61,61,61,62,62,62,62,63,63,63,63]
+; AVX512VBMI-ONLY-NEXT:    vpermb %zmm0, %zmm1, %zmm1
+; AVX512VBMI-ONLY-NEXT:    vpmovb2m %zmm1, %k2
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [32,32,32,32,33,33,33,33,34,34,34,34,35,35,35,35,36,36,36,36,37,37,37,37,38,38,38,38,39,39,39,39,40,40,40,40,41,41,41,41,42,42,42,42,43,43,43,43,44,44,44,44,45,45,45,45,46,46,46,46,47,47,47,47]
+; AVX512VBMI-ONLY-NEXT:    vpermb %zmm0, %zmm1, %zmm0
+; AVX512VBMI-ONLY-NEXT:    vpmovb2m %zmm0, %k3
+; AVX512VBMI-ONLY-NEXT:    kmovq %k3, 16(%rsi)
+; AVX512VBMI-ONLY-NEXT:    kmovq %k2, 24(%rsi)
+; AVX512VBMI-ONLY-NEXT:    kmovq %k1, 8(%rsi)
+; AVX512VBMI-ONLY-NEXT:    kmovq %k0, (%rsi)
+; AVX512VBMI-ONLY-NEXT:    vzeroupper
+; AVX512VBMI-ONLY-NEXT:    retq
+  %src.vec = load <64 x i1>, ptr %in.vec, align 64
+  %tgt.mask = shufflevector <64 x i1> %src.vec, <64 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
+  store <256 x i1> %tgt.mask, ptr %out.vec, align 64
+  ret void
+}
+
+; Replication factor 5, 2 source lanes.
+; Loads a <2 x i1> mask from %in.vec, repeats each lane five times in order
+; (shuffle mask 0,0,0,0,0,1,1,1,1,1) and stores the <10 x i1> result to
+; %out.vec.
+;
+; The expected code is purely scalar for all three prefix groups used below
+; (AVX512F-ONLY, AVX512DQ, AVX512BW): the mask is loaded into a k-register,
+; each source bit is moved to a GPR, the ten result bits are assembled with
+; LEA/shift/OR, masked with 0x3FF and written with one 16-bit store.
+; The groups differ only in mask-instruction width: the DQ group uses
+; kmovb/kshiftrb for the load and shift, the BW group uses kmovd for the
+; k-register to GPR moves.
+;
+; The check lines are autogenerated (see the autogeneration note at the top of
+; the file); regenerate them with the update script instead of editing by hand.
+define void @mask_replication_factor5_vf2(ptr %in.vec, ptr %out.vec) nounwind {
+; AVX512F-ONLY-LABEL: mask_replication_factor5_vf2:
+; AVX512F-ONLY:       # %bb.0:
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k0
+; AVX512F-ONLY-NEXT:    kshiftrw $1, %k0, %k1
+; AVX512F-ONLY-NEXT:    kmovw %k0, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    leal (%rax,%rax,2), %ecx
+; AVX512F-ONLY-NEXT:    leal (%rcx,%rax,4), %ecx
+; AVX512F-ONLY-NEXT:    leal (%rcx,%rax,8), %ecx
+; AVX512F-ONLY-NEXT:    shll $4, %eax
+; AVX512F-ONLY-NEXT:    orl %ecx, %eax
+; AVX512F-ONLY-NEXT:    kmovw %k1, %ecx
+; AVX512F-ONLY-NEXT:    movl %ecx, %edx
+; AVX512F-ONLY-NEXT:    andl $1, %edx
+; AVX512F-ONLY-NEXT:    movl %edx, %edi
+; AVX512F-ONLY-NEXT:    shll $5, %edi
+; AVX512F-ONLY-NEXT:    orl %eax, %edi
+; AVX512F-ONLY-NEXT:    movl %edx, %eax
+; AVX512F-ONLY-NEXT:    shll $6, %eax
+; AVX512F-ONLY-NEXT:    movl %edx, %r8d
+; AVX512F-ONLY-NEXT:    shll $7, %r8d
+; AVX512F-ONLY-NEXT:    orl %eax, %r8d
+; AVX512F-ONLY-NEXT:    shll $8, %edx
+; AVX512F-ONLY-NEXT:    orl %r8d, %edx
+; AVX512F-ONLY-NEXT:    shll $9, %ecx
+; AVX512F-ONLY-NEXT:    orl %edx, %ecx
+; AVX512F-ONLY-NEXT:    orl %edi, %ecx
+; AVX512F-ONLY-NEXT:    andl $1023, %ecx # imm = 0x3FF
+; AVX512F-ONLY-NEXT:    movw %cx, (%rsi)
+; AVX512F-ONLY-NEXT:    retq
+;
+; AVX512DQ-LABEL: mask_replication_factor5_vf2:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    kmovb (%rdi), %k0
+; AVX512DQ-NEXT:    kshiftrb $1, %k0, %k1
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    leal (%rax,%rax,2), %ecx
+; AVX512DQ-NEXT:    leal (%rcx,%rax,4), %ecx
+; AVX512DQ-NEXT:    leal (%rcx,%rax,8), %ecx
+; AVX512DQ-NEXT:    shll $4, %eax
+; AVX512DQ-NEXT:    orl %ecx, %eax
+; AVX512DQ-NEXT:    kmovw %k1, %ecx
+; AVX512DQ-NEXT:    movl %ecx, %edx
+; AVX512DQ-NEXT:    andl $1, %edx
+; AVX512DQ-NEXT:    movl %edx, %edi
+; AVX512DQ-NEXT:    shll $5, %edi
+; AVX512DQ-NEXT:    orl %eax, %edi
+; AVX512DQ-NEXT:    movl %edx, %eax
+; AVX512DQ-NEXT:    shll $6, %eax
+; AVX512DQ-NEXT:    movl %edx, %r8d
+; AVX512DQ-NEXT:    shll $7, %r8d
+; AVX512DQ-NEXT:    orl %eax, %r8d
+; AVX512DQ-NEXT:    shll $8, %edx
+; AVX512DQ-NEXT:    orl %r8d, %edx
+; AVX512DQ-NEXT:    shll $9, %ecx
+; AVX512DQ-NEXT:    orl %edx, %ecx
+; AVX512DQ-NEXT:    orl %edi, %ecx
+; AVX512DQ-NEXT:    andl $1023, %ecx # imm = 0x3FF
+; AVX512DQ-NEXT:    movw %cx, (%rsi)
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: mask_replication_factor5_vf2:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    kmovw (%rdi), %k0
+; AVX512BW-NEXT:    kshiftrw $1, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    leal (%rax,%rax,2), %ecx
+; AVX512BW-NEXT:    leal (%rcx,%rax,4), %ecx
+; AVX512BW-NEXT:    leal (%rcx,%rax,8), %ecx
+; AVX512BW-NEXT:    shll $4, %eax
+; AVX512BW-NEXT:    orl %ecx, %eax
+; AVX512BW-NEXT:    kmovd %k1, %ecx
+; AVX512BW-NEXT:    movl %ecx, %edx
+; AVX512BW-NEXT:    andl $1, %edx
+; AVX512BW-NEXT:    movl %edx, %edi
+; AVX512BW-NEXT:    shll $5, %edi
+; AVX512BW-NEXT:    orl %eax, %edi
+; AVX512BW-NEXT:    movl %edx, %eax
+; AVX512BW-NEXT:    shll $6, %eax
+; AVX512BW-NEXT:    movl %edx, %r8d
+; AVX512BW-NEXT:    shll $7, %r8d
+; AVX512BW-NEXT:    orl %eax, %r8d
+; AVX512BW-NEXT:    shll $8, %edx
+; AVX512BW-NEXT:    orl %r8d, %edx
+; AVX512BW-NEXT:    shll $9, %ecx
+; AVX512BW-NEXT:    orl %edx, %ecx
+; AVX512BW-NEXT:    orl %edi, %ecx
+; AVX512BW-NEXT:    andl $1023, %ecx # imm = 0x3FF
+; AVX512BW-NEXT:    movw %cx, (%rsi)
+; AVX512BW-NEXT:    retq
+  %src.vec = load <2 x i1>, ptr %in.vec, align 64
+  %tgt.mask = shufflevector <2 x i1> %src.vec, <2 x i1> poison, <10 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1>
+  store <10 x i1> %tgt.mask, ptr %out.vec, align 64
+  ret void
+}
+
+; Replication factor 5, 4 source lanes.
+; Loads a <4 x i1> mask from %in.vec, repeats each lane five times in order
+; (shuffle mask 0 x5, 1 x5, 2 x5, 3 x5) and stores the <20 x i1> result to
+; %out.vec.
+;
+; The expected code is purely scalar for all three prefix groups used below
+; (AVX512F-ONLY, AVX512DQ, AVX512BW): each source bit is extracted from the
+; k-register into a GPR and the result is assembled bit by bit with
+; LEA/shift/OR. The 20-bit result is written in two pieces: a 16-bit store of
+; bits 0..15 at offset 0, then bits 16..19 (shifted right by 16 and masked
+; with 15) as a byte store at offset 2.
+; The groups differ only in mask-instruction width: the DQ group uses
+; kmovb/kshiftrb for the load and shifts, the BW group uses kmovd for the
+; k-register to GPR moves.
+;
+; The check lines are autogenerated (see the autogeneration note at the top of
+; the file); regenerate them with the update script instead of editing by hand.
+define void @mask_replication_factor5_vf4(ptr %in.vec, ptr %out.vec) nounwind {
+; AVX512F-ONLY-LABEL: mask_replication_factor5_vf4:
+; AVX512F-ONLY:       # %bb.0:
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k2
+; AVX512F-ONLY-NEXT:    kshiftrw $3, %k2, %k0
+; AVX512F-ONLY-NEXT:    kshiftrw $2, %k2, %k1
+; AVX512F-ONLY-NEXT:    kshiftrw $1, %k2, %k3
+; AVX512F-ONLY-NEXT:    kmovw %k2, %ecx
+; AVX512F-ONLY-NEXT:    andl $1, %ecx
+; AVX512F-ONLY-NEXT:    leal (%rcx,%rcx,2), %eax
+; AVX512F-ONLY-NEXT:    leal (%rax,%rcx,4), %eax
+; AVX512F-ONLY-NEXT:    leal (%rax,%rcx,8), %eax
+; AVX512F-ONLY-NEXT:    shll $4, %ecx
+; AVX512F-ONLY-NEXT:    orl %eax, %ecx
+; AVX512F-ONLY-NEXT:    kmovw %k3, %edx
+; AVX512F-ONLY-NEXT:    andl $1, %edx
+; AVX512F-ONLY-NEXT:    movl %edx, %eax
+; AVX512F-ONLY-NEXT:    shll $5, %eax
+; AVX512F-ONLY-NEXT:    orl %ecx, %eax
+; AVX512F-ONLY-NEXT:    movl %edx, %ecx
+; AVX512F-ONLY-NEXT:    shll $6, %ecx
+; AVX512F-ONLY-NEXT:    movl %edx, %edi
+; AVX512F-ONLY-NEXT:    shll $7, %edi
+; AVX512F-ONLY-NEXT:    orl %ecx, %edi
+; AVX512F-ONLY-NEXT:    movl %edx, %ecx
+; AVX512F-ONLY-NEXT:    shll $8, %ecx
+; AVX512F-ONLY-NEXT:    orl %edi, %ecx
+; AVX512F-ONLY-NEXT:    shll $9, %edx
+; AVX512F-ONLY-NEXT:    orl %ecx, %edx
+; AVX512F-ONLY-NEXT:    kmovw %k1, %edi
+; AVX512F-ONLY-NEXT:    andl $1, %edi
+; AVX512F-ONLY-NEXT:    movl %edi, %ecx
+; AVX512F-ONLY-NEXT:    shll $10, %ecx
+; AVX512F-ONLY-NEXT:    orl %edx, %ecx
+; AVX512F-ONLY-NEXT:    movl %edi, %edx
+; AVX512F-ONLY-NEXT:    shll $11, %edx
+; AVX512F-ONLY-NEXT:    orl %ecx, %edx
+; AVX512F-ONLY-NEXT:    movl %edi, %ecx
+; AVX512F-ONLY-NEXT:    shll $12, %ecx
+; AVX512F-ONLY-NEXT:    orl %edx, %ecx
+; AVX512F-ONLY-NEXT:    movl %edi, %edx
+; AVX512F-ONLY-NEXT:    shll $13, %edx
+; AVX512F-ONLY-NEXT:    orl %ecx, %edx
+; AVX512F-ONLY-NEXT:    shll $14, %edi
+; AVX512F-ONLY-NEXT:    orl %edx, %edi
+; AVX512F-ONLY-NEXT:    kmovw %k0, %ecx
+; AVX512F-ONLY-NEXT:    movl %ecx, %edx
+; AVX512F-ONLY-NEXT:    andl $1, %edx
+; AVX512F-ONLY-NEXT:    movl %edx, %r8d
+; AVX512F-ONLY-NEXT:    shll $15, %r8d
+; AVX512F-ONLY-NEXT:    orl %edi, %r8d
+; AVX512F-ONLY-NEXT:    orl %eax, %r8d
+; AVX512F-ONLY-NEXT:    movw %r8w, (%rsi)
+; AVX512F-ONLY-NEXT:    movl %edx, %eax
+; AVX512F-ONLY-NEXT:    shll $16, %eax
+; AVX512F-ONLY-NEXT:    movl %edx, %edi
+; AVX512F-ONLY-NEXT:    shll $17, %edi
+; AVX512F-ONLY-NEXT:    orl %eax, %edi
+; AVX512F-ONLY-NEXT:    shll $18, %edx
+; AVX512F-ONLY-NEXT:    orl %edi, %edx
+; AVX512F-ONLY-NEXT:    shll $19, %ecx
+; AVX512F-ONLY-NEXT:    orl %edx, %ecx
+; AVX512F-ONLY-NEXT:    orl %r8d, %ecx
+; AVX512F-ONLY-NEXT:    shrl $16, %ecx
+; AVX512F-ONLY-NEXT:    andl $15, %ecx
+; AVX512F-ONLY-NEXT:    movb %cl, 2(%rsi)
+; AVX512F-ONLY-NEXT:    retq
+;
+; AVX512DQ-LABEL: mask_replication_factor5_vf4:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    kmovb (%rdi), %k2
+; AVX512DQ-NEXT:    kshiftrb $3, %k2, %k0
+; AVX512DQ-NEXT:    kshiftrb $2, %k2, %k1
+; AVX512DQ-NEXT:    kshiftrb $1, %k2, %k3
+; AVX512DQ-NEXT:    kmovw %k2, %ecx
+; AVX512DQ-NEXT:    andl $1, %ecx
+; AVX512DQ-NEXT:    leal (%rcx,%rcx,2), %eax
+; AVX512DQ-NEXT:    leal (%rax,%rcx,4), %eax
+; AVX512DQ-NEXT:    leal (%rax,%rcx,8), %eax
+; AVX512DQ-NEXT:    shll $4, %ecx
+; AVX512DQ-NEXT:    orl %eax, %ecx
+; AVX512DQ-NEXT:    kmovw %k3, %edx
+; AVX512DQ-NEXT:    andl $1, %edx
+; AVX512DQ-NEXT:    movl %edx, %eax
+; AVX512DQ-NEXT:    shll $5, %eax
+; AVX512DQ-NEXT:    orl %ecx, %eax
+; AVX512DQ-NEXT:    movl %edx, %ecx
+; AVX512DQ-NEXT:    shll $6, %ecx
+; AVX512DQ-NEXT:    movl %edx, %edi
+; AVX512DQ-NEXT:    shll $7, %edi
+; AVX512DQ-NEXT:    orl %ecx, %edi
+; AVX512DQ-NEXT:    movl %edx, %ecx
+; AVX512DQ-NEXT:    shll $8, %ecx
+; AVX512DQ-NEXT:    orl %edi, %ecx
+; AVX512DQ-NEXT:    shll $9, %edx
+; AVX512DQ-NEXT:    orl %ecx, %edx
+; AVX512DQ-NEXT:    kmovw %k1, %edi
+; AVX512DQ-NEXT:    andl $1, %edi
+; AVX512DQ-NEXT:    movl %edi, %ecx
+; AVX512DQ-NEXT:    shll $10, %ecx
+; AVX512DQ-NEXT:    orl %edx, %ecx
+; AVX512DQ-NEXT:    movl %edi, %edx
+; AVX512DQ-NEXT:    shll $11, %edx
+; AVX512DQ-NEXT:    orl %ecx, %edx
+; AVX512DQ-NEXT:    movl %edi, %ecx
+; AVX512DQ-NEXT:    shll $12, %ecx
+; AVX512DQ-NEXT:    orl %edx, %ecx
+; AVX512DQ-NEXT:    movl %edi, %edx
+; AVX512DQ-NEXT:    shll $13, %edx
+; AVX512DQ-NEXT:    orl %ecx, %edx
+; AVX512DQ-NEXT:    shll $14, %edi
+; AVX512DQ-NEXT:    orl %edx, %edi
+; AVX512DQ-NEXT:    kmovw %k0, %ecx
+; AVX512DQ-NEXT:    movl %ecx, %edx
+; AVX512DQ-NEXT:    andl $1, %edx
+; AVX512DQ-NEXT:    movl %edx, %r8d
+; AVX512DQ-NEXT:    shll $15, %r8d
+; AVX512DQ-NEXT:    orl %edi, %r8d
+; AVX512DQ-NEXT:    orl %eax, %r8d
+; AVX512DQ-NEXT:    movw %r8w, (%rsi)
+; AVX512DQ-NEXT:    movl %edx, %eax
+; AVX512DQ-NEXT:    shll $16, %eax
+; AVX512DQ-NEXT:    movl %edx, %edi
+; AVX512DQ-NEXT:    shll $17, %edi
+; AVX512DQ-NEXT:    orl %eax, %edi
+; AVX512DQ-NEXT:    shll $18, %edx
+; AVX512DQ-NEXT:    orl %edi, %edx
+; AVX512DQ-NEXT:    shll $19, %ecx
+; AVX512DQ-NEXT:    orl %edx, %ecx
+; AVX512DQ-NEXT:    orl %r8d, %ecx
+; AVX512DQ-NEXT:    shrl $16, %ecx
+; AVX512DQ-NEXT:    andl $15, %ecx
+; AVX512DQ-NEXT:    movb %cl, 2(%rsi)
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: mask_replication_factor5_vf4:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    kmovw (%rdi), %k2
+; AVX512BW-NEXT:    kshiftrw $3, %k2, %k0
+; AVX512BW-NEXT:    kshiftrw $2, %k2, %k1
+; AVX512BW-NEXT:    kshiftrw $1, %k2, %k3
+; AVX512BW-NEXT:    kmovd %k2, %ecx
+; AVX512BW-NEXT:    andl $1, %ecx
+; AVX512BW-NEXT:    leal (%rcx,%rcx,2), %eax
+; AVX512BW-NEXT:    leal (%rax,%rcx,4), %eax
+; AVX512BW-NEXT:    leal (%rax,%rcx,8), %eax
+; AVX512BW-NEXT:    shll $4, %ecx
+; AVX512BW-NEXT:    orl %eax, %ecx
+; AVX512BW-NEXT:    kmovd %k3, %edx
+; AVX512BW-NEXT:    andl $1, %edx
+; AVX512BW-NEXT:    movl %edx, %eax
+; AVX512BW-NEXT:    shll $5, %eax
+; AVX512BW-NEXT:    orl %ecx, %eax
+; AVX512BW-NEXT:    movl %edx, %ecx
+; AVX512BW-NEXT:    shll $6, %ecx
+; AVX512BW-NEXT:    movl %edx, %edi
+; AVX512BW-NEXT:    shll $7, %edi
+; AVX512BW-NEXT:    orl %ecx, %edi
+; AVX512BW-NEXT:    movl %edx, %ecx
+; AVX512BW-NEXT:    shll $8, %ecx
+; AVX512BW-NEXT:    orl %edi, %ecx
+; AVX512BW-NEXT:    shll $9, %edx
+; AVX512BW-NEXT:    orl %ecx, %edx
+; AVX512BW-NEXT:    kmovd %k1, %edi
+; AVX512BW-NEXT:    andl $1, %edi
+; AVX512BW-NEXT:    movl %edi, %ecx
+; AVX512BW-NEXT:    shll $10, %ecx
+; AVX512BW-NEXT:    orl %edx, %ecx
+; AVX512BW-NEXT:    movl %edi, %edx
+; AVX512BW-NEXT:    shll $11, %edx
+; AVX512BW-NEXT:    orl %ecx, %edx
+; AVX512BW-NEXT:    movl %edi, %ecx
+; AVX512BW-NEXT:    shll $12, %ecx
+; AVX512BW-NEXT:    orl %edx, %ecx
+; AVX512BW-NEXT:    movl %edi, %edx
+; AVX512BW-NEXT:    shll $13, %edx
+; AVX512BW-NEXT:    orl %ecx, %edx
+; AVX512BW-NEXT:    shll $14, %edi
+; AVX512BW-NEXT:    orl %edx, %edi
+; AVX512BW-NEXT:    kmovd %k0, %ecx
+; AVX512BW-NEXT:    movl %ecx, %edx
+; AVX512BW-NEXT:    andl $1, %edx
+; AVX512BW-NEXT:    movl %edx, %r8d
+; AVX512BW-NEXT:    shll $15, %r8d
+; AVX512BW-NEXT:    orl %edi, %r8d
+; AVX512BW-NEXT:    orl %eax, %r8d
+; AVX512BW-NEXT:    movw %r8w, (%rsi)
+; AVX512BW-NEXT:    movl %edx, %eax
+; AVX512BW-NEXT:    shll $16, %eax
+; AVX512BW-NEXT:    movl %edx, %edi
+; AVX512BW-NEXT:    shll $17, %edi
+; AVX512BW-NEXT:    orl %eax, %edi
+; AVX512BW-NEXT:    shll $18, %edx
+; AVX512BW-NEXT:    orl %edi, %edx
+; AVX512BW-NEXT:    shll $19, %ecx
+; AVX512BW-NEXT:    orl %edx, %ecx
+; AVX512BW-NEXT:    orl %r8d, %ecx
+; AVX512BW-NEXT:    shrl $16, %ecx
+; AVX512BW-NEXT:    andl $15, %ecx
+; AVX512BW-NEXT:    movb %cl, 2(%rsi)
+; AVX512BW-NEXT:    retq
+  %src.vec = load <4 x i1>, ptr %in.vec, align 64
+  %tgt.mask = shufflevector <4 x i1> %src.vec, <4 x i1> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
+  store <20 x i1> %tgt.mask, ptr %out.vec, align 64
+  ret void
+}
+
+; Replication factor 5, 8 source lanes.
+; Loads an <8 x i1> mask from %in.vec, repeats each lane five times in order
+; (shuffle mask 0 x5, 1 x5, ..., 7 x5) and stores the <40 x i1> result to
+; %out.vec.
+;
+; The expected code is purely scalar for all three prefix groups used below
+; (AVX512F-ONLY, AVX512DQ, AVX512BW): every source bit is shifted out of the
+; k-register into its own GPR and the result is assembled bit by bit with
+; 64-bit LEA/shift/OR. %r14 and %rbx are pushed on entry and popped before
+; returning. The 40-bit result is written in two pieces: a 32-bit store of
+; bits 0..31 at offset 0, then bits 32..39 (shifted right by 32) as a byte
+; store at offset 4.
+; The groups differ only in mask-instruction width: the DQ group uses
+; kmovb/kshiftrb for the load and shifts, the BW group uses kmovd for the
+; k-register to GPR moves.
+;
+; The check lines are autogenerated (see the autogeneration note at the top of
+; the file); regenerate them with the update script instead of editing by hand.
+define void @mask_replication_factor5_vf8(ptr %in.vec, ptr %out.vec) nounwind {
+; AVX512F-ONLY-LABEL: mask_replication_factor5_vf8:
+; AVX512F-ONLY:       # %bb.0:
+; AVX512F-ONLY-NEXT:    pushq %r14
+; AVX512F-ONLY-NEXT:    pushq %rbx
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k0
+; AVX512F-ONLY-NEXT:    kshiftrw $7, %k0, %k1
+; AVX512F-ONLY-NEXT:    kmovw %k1, %eax
+; AVX512F-ONLY-NEXT:    kshiftrw $6, %k0, %k1
+; AVX512F-ONLY-NEXT:    kmovw %k1, %ecx
+; AVX512F-ONLY-NEXT:    kshiftrw $5, %k0, %k1
+; AVX512F-ONLY-NEXT:    kmovw %k1, %edi
+; AVX512F-ONLY-NEXT:    kshiftrw $4, %k0, %k1
+; AVX512F-ONLY-NEXT:    kmovw %k1, %r8d
+; AVX512F-ONLY-NEXT:    kshiftrw $3, %k0, %k1
+; AVX512F-ONLY-NEXT:    kmovw %k1, %r9d
+; AVX512F-ONLY-NEXT:    kshiftrw $2, %k0, %k1
+; AVX512F-ONLY-NEXT:    kmovw %k1, %r10d
+; AVX512F-ONLY-NEXT:    kshiftrw $1, %k0, %k1
+; AVX512F-ONLY-NEXT:    kmovw %k1, %edx
+; AVX512F-ONLY-NEXT:    kmovw %k0, %r11d
+; AVX512F-ONLY-NEXT:    movzbl %r11b, %ebx
+; AVX512F-ONLY-NEXT:    andl $1, %ebx
+; AVX512F-ONLY-NEXT:    leaq (%rbx,%rbx,2), %r11
+; AVX512F-ONLY-NEXT:    leaq (%r11,%rbx,4), %r11
+; AVX512F-ONLY-NEXT:    leaq (%r11,%rbx,8), %r11
+; AVX512F-ONLY-NEXT:    shlq $4, %rbx
+; AVX512F-ONLY-NEXT:    orq %r11, %rbx
+; AVX512F-ONLY-NEXT:    movzbl %dl, %r11d
+; AVX512F-ONLY-NEXT:    andl $1, %r11d
+; AVX512F-ONLY-NEXT:    movq %r11, %rdx
+; AVX512F-ONLY-NEXT:    shlq $5, %rdx
+; AVX512F-ONLY-NEXT:    orq %rbx, %rdx
+; AVX512F-ONLY-NEXT:    movq %r11, %rbx
+; AVX512F-ONLY-NEXT:    shlq $6, %rbx
+; AVX512F-ONLY-NEXT:    movq %r11, %r14
+; AVX512F-ONLY-NEXT:    shlq $7, %r14
+; AVX512F-ONLY-NEXT:    orq %rbx, %r14
+; AVX512F-ONLY-NEXT:    movq %r11, %rbx
+; AVX512F-ONLY-NEXT:    shlq $8, %rbx
+; AVX512F-ONLY-NEXT:    orq %r14, %rbx
+; AVX512F-ONLY-NEXT:    shlq $9, %r11
+; AVX512F-ONLY-NEXT:    orq %rbx, %r11
+; AVX512F-ONLY-NEXT:    movzbl %r10b, %r10d
+; AVX512F-ONLY-NEXT:    andl $1, %r10d
+; AVX512F-ONLY-NEXT:    movq %r10, %rbx
+; AVX512F-ONLY-NEXT:    shlq $10, %rbx
+; AVX512F-ONLY-NEXT:    orq %r11, %rbx
+; AVX512F-ONLY-NEXT:    movq %r10, %r11
+; AVX512F-ONLY-NEXT:    shlq $11, %r11
+; AVX512F-ONLY-NEXT:    orq %rbx, %r11
+; AVX512F-ONLY-NEXT:    movq %r10, %rbx
+; AVX512F-ONLY-NEXT:    shlq $12, %rbx
+; AVX512F-ONLY-NEXT:    orq %r11, %rbx
+; AVX512F-ONLY-NEXT:    movq %r10, %r11
+; AVX512F-ONLY-NEXT:    shlq $13, %r11
+; AVX512F-ONLY-NEXT:    orq %rbx, %r11
+; AVX512F-ONLY-NEXT:    shlq $14, %r10
+; AVX512F-ONLY-NEXT:    orq %r11, %r10
+; AVX512F-ONLY-NEXT:    movzbl %r9b, %r9d
+; AVX512F-ONLY-NEXT:    andl $1, %r9d
+; AVX512F-ONLY-NEXT:    movq %r9, %r11
+; AVX512F-ONLY-NEXT:    shlq $15, %r11
+; AVX512F-ONLY-NEXT:    orq %r10, %r11
+; AVX512F-ONLY-NEXT:    movq %r9, %r10
+; AVX512F-ONLY-NEXT:    shlq $16, %r10
+; AVX512F-ONLY-NEXT:    orq %r11, %r10
+; AVX512F-ONLY-NEXT:    movq %r9, %r11
+; AVX512F-ONLY-NEXT:    shlq $17, %r11
+; AVX512F-ONLY-NEXT:    orq %r10, %r11
+; AVX512F-ONLY-NEXT:    movq %r9, %r10
+; AVX512F-ONLY-NEXT:    shlq $18, %r10
+; AVX512F-ONLY-NEXT:    orq %r11, %r10
+; AVX512F-ONLY-NEXT:    shlq $19, %r9
+; AVX512F-ONLY-NEXT:    orq %r10, %r9
+; AVX512F-ONLY-NEXT:    movzbl %r8b, %r8d
+; AVX512F-ONLY-NEXT:    andl $1, %r8d
+; AVX512F-ONLY-NEXT:    movq %r8, %r10
+; AVX512F-ONLY-NEXT:    shlq $20, %r10
+; AVX512F-ONLY-NEXT:    orq %r9, %r10
+; AVX512F-ONLY-NEXT:    movq %r8, %r9
+; AVX512F-ONLY-NEXT:    shlq $21, %r9
+; AVX512F-ONLY-NEXT:    orq %r10, %r9
+; AVX512F-ONLY-NEXT:    movq %r8, %r10
+; AVX512F-ONLY-NEXT:    shlq $22, %r10
+; AVX512F-ONLY-NEXT:    orq %r9, %r10
+; AVX512F-ONLY-NEXT:    movq %r8, %r9
+; AVX512F-ONLY-NEXT:    shlq $23, %r9
+; AVX512F-ONLY-NEXT:    orq %r10, %r9
+; AVX512F-ONLY-NEXT:    shlq $24, %r8
+; AVX512F-ONLY-NEXT:    orq %r9, %r8
+; AVX512F-ONLY-NEXT:    movzbl %dil, %r9d
+; AVX512F-ONLY-NEXT:    andl $1, %r9d
+; AVX512F-ONLY-NEXT:    movq %r9, %rdi
+; AVX512F-ONLY-NEXT:    shlq $25, %rdi
+; AVX512F-ONLY-NEXT:    orq %r8, %rdi
+; AVX512F-ONLY-NEXT:    movq %r9, %r8
+; AVX512F-ONLY-NEXT:    shlq $26, %r8
+; AVX512F-ONLY-NEXT:    orq %rdi, %r8
+; AVX512F-ONLY-NEXT:    movq %r9, %rdi
+; AVX512F-ONLY-NEXT:    shlq $27, %rdi
+; AVX512F-ONLY-NEXT:    orq %r8, %rdi
+; AVX512F-ONLY-NEXT:    movq %r9, %r8
+; AVX512F-ONLY-NEXT:    shlq $28, %r8
+; AVX512F-ONLY-NEXT:    orq %rdi, %r8
+; AVX512F-ONLY-NEXT:    shlq $29, %r9
+; AVX512F-ONLY-NEXT:    orq %r8, %r9
+; AVX512F-ONLY-NEXT:    movzbl %cl, %edi
+; AVX512F-ONLY-NEXT:    andl $1, %edi
+; AVX512F-ONLY-NEXT:    movq %rdi, %r8
+; AVX512F-ONLY-NEXT:    shlq $30, %r8
+; AVX512F-ONLY-NEXT:    orq %r9, %r8
+; AVX512F-ONLY-NEXT:    shlq $31, %rdi
+; AVX512F-ONLY-NEXT:    orq %r8, %rdi
+; AVX512F-ONLY-NEXT:    orq %rdx, %rdi
+; AVX512F-ONLY-NEXT:    movl %edi, (%rsi)
+; AVX512F-ONLY-NEXT:    andl $1, %ecx
+; AVX512F-ONLY-NEXT:    movq %rcx, %rdx
+; AVX512F-ONLY-NEXT:    shlq $32, %rdx
+; AVX512F-ONLY-NEXT:    movq %rcx, %r8
+; AVX512F-ONLY-NEXT:    shlq $33, %r8
+; AVX512F-ONLY-NEXT:    orq %rdx, %r8
+; AVX512F-ONLY-NEXT:    shlq $34, %rcx
+; AVX512F-ONLY-NEXT:    orq %r8, %rcx
+; AVX512F-ONLY-NEXT:    movzbl %al, %edx
+; AVX512F-ONLY-NEXT:    # kill: def $eax killed $eax def $rax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %r8
+; AVX512F-ONLY-NEXT:    shlq $35, %r8
+; AVX512F-ONLY-NEXT:    orq %rcx, %r8
+; AVX512F-ONLY-NEXT:    movq %rax, %rcx
+; AVX512F-ONLY-NEXT:    shlq $36, %rcx
+; AVX512F-ONLY-NEXT:    orq %r8, %rcx
+; AVX512F-ONLY-NEXT:    movq %rax, %r8
+; AVX512F-ONLY-NEXT:    shlq $37, %r8
+; AVX512F-ONLY-NEXT:    orq %rcx, %r8
+; AVX512F-ONLY-NEXT:    shlq $38, %rax
+; AVX512F-ONLY-NEXT:    orq %r8, %rax
+; AVX512F-ONLY-NEXT:    shlq $39, %rdx
+; AVX512F-ONLY-NEXT:    orq %rax, %rdx
+; AVX512F-ONLY-NEXT:    orq %rdi, %rdx
+; AVX512F-ONLY-NEXT:    shrq $32, %rdx
+; AVX512F-ONLY-NEXT:    movb %dl, 4(%rsi)
+; AVX512F-ONLY-NEXT:    popq %rbx
+; AVX512F-ONLY-NEXT:    popq %r14
+; AVX512F-ONLY-NEXT:    retq
+;
+; AVX512DQ-LABEL: mask_replication_factor5_vf8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    pushq %r14
+; AVX512DQ-NEXT:    pushq %rbx
+; AVX512DQ-NEXT:    kmovb (%rdi), %k0
+; AVX512DQ-NEXT:    kshiftrb $7, %k0, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %eax
+; AVX512DQ-NEXT:    kshiftrb $6, %k0, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %ecx
+; AVX512DQ-NEXT:    kshiftrb $5, %k0, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %edi
+; AVX512DQ-NEXT:    kshiftrb $4, %k0, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %r8d
+; AVX512DQ-NEXT:    kshiftrb $3, %k0, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %r9d
+; AVX512DQ-NEXT:    kshiftrb $2, %k0, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %r10d
+; AVX512DQ-NEXT:    kshiftrb $1, %k0, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %edx
+; AVX512DQ-NEXT:    kmovw %k0, %r11d
+; AVX512DQ-NEXT:    movzbl %r11b, %ebx
+; AVX512DQ-NEXT:    andl $1, %ebx
+; AVX512DQ-NEXT:    leaq (%rbx,%rbx,2), %r11
+; AVX512DQ-NEXT:    leaq (%r11,%rbx,4), %r11
+; AVX512DQ-NEXT:    leaq (%r11,%rbx,8), %r11
+; AVX512DQ-NEXT:    shlq $4, %rbx
+; AVX512DQ-NEXT:    orq %r11, %rbx
+; AVX512DQ-NEXT:    movzbl %dl, %r11d
+; AVX512DQ-NEXT:    andl $1, %r11d
+; AVX512DQ-NEXT:    movq %r11, %rdx
+; AVX512DQ-NEXT:    shlq $5, %rdx
+; AVX512DQ-NEXT:    orq %rbx, %rdx
+; AVX512DQ-NEXT:    movq %r11, %rbx
+; AVX512DQ-NEXT:    shlq $6, %rbx
+; AVX512DQ-NEXT:    movq %r11, %r14
+; AVX512DQ-NEXT:    shlq $7, %r14
+; AVX512DQ-NEXT:    orq %rbx, %r14
+; AVX512DQ-NEXT:    movq %r11, %rbx
+; AVX512DQ-NEXT:    shlq $8, %rbx
+; AVX512DQ-NEXT:    orq %r14, %rbx
+; AVX512DQ-NEXT:    shlq $9, %r11
+; AVX512DQ-NEXT:    orq %rbx, %r11
+; AVX512DQ-NEXT:    movzbl %r10b, %r10d
+; AVX512DQ-NEXT:    andl $1, %r10d
+; AVX512DQ-NEXT:    movq %r10, %rbx
+; AVX512DQ-NEXT:    shlq $10, %rbx
+; AVX512DQ-NEXT:    orq %r11, %rbx
+; AVX512DQ-NEXT:    movq %r10, %r11
+; AVX512DQ-NEXT:    shlq $11, %r11
+; AVX512DQ-NEXT:    orq %rbx, %r11
+; AVX512DQ-NEXT:    movq %r10, %rbx
+; AVX512DQ-NEXT:    shlq $12, %rbx
+; AVX512DQ-NEXT:    orq %r11, %rbx
+; AVX512DQ-NEXT:    movq %r10, %r11
+; AVX512DQ-NEXT:    shlq $13, %r11
+; AVX512DQ-NEXT:    orq %rbx, %r11
+; AVX512DQ-NEXT:    shlq $14, %r10
+; AVX512DQ-NEXT:    orq %r11, %r10
+; AVX512DQ-NEXT:    movzbl %r9b, %r9d
+; AVX512DQ-NEXT:    andl $1, %r9d
+; AVX512DQ-NEXT:    movq %r9, %r11
+; AVX512DQ-NEXT:    shlq $15, %r11
+; AVX512DQ-NEXT:    orq %r10, %r11
+; AVX512DQ-NEXT:    movq %r9, %r10
+; AVX512DQ-NEXT:    shlq $16, %r10
+; AVX512DQ-NEXT:    orq %r11, %r10
+; AVX512DQ-NEXT:    movq %r9, %r11
+; AVX512DQ-NEXT:    shlq $17, %r11
+; AVX512DQ-NEXT:    orq %r10, %r11
+; AVX512DQ-NEXT:    movq %r9, %r10
+; AVX512DQ-NEXT:    shlq $18, %r10
+; AVX512DQ-NEXT:    orq %r11, %r10
+; AVX512DQ-NEXT:    shlq $19, %r9
+; AVX512DQ-NEXT:    orq %r10, %r9
+; AVX512DQ-NEXT:    movzbl %r8b, %r8d
+; AVX512DQ-NEXT:    andl $1, %r8d
+; AVX512DQ-NEXT:    movq %r8, %r10
+; AVX512DQ-NEXT:    shlq $20, %r10
+; AVX512DQ-NEXT:    orq %r9, %r10
+; AVX512DQ-NEXT:    movq %r8, %r9
+; AVX512DQ-NEXT:    shlq $21, %r9
+; AVX512DQ-NEXT:    orq %r10, %r9
+; AVX512DQ-NEXT:    movq %r8, %r10
+; AVX512DQ-NEXT:    shlq $22, %r10
+; AVX512DQ-NEXT:    orq %r9, %r10
+; AVX512DQ-NEXT:    movq %r8, %r9
+; AVX512DQ-NEXT:    shlq $23, %r9
+; AVX512DQ-NEXT:    orq %r10, %r9
+; AVX512DQ-NEXT:    shlq $24, %r8
+; AVX512DQ-NEXT:    orq %r9, %r8
+; AVX512DQ-NEXT:    movzbl %dil, %r9d
+; AVX512DQ-NEXT:    andl $1, %r9d
+; AVX512DQ-NEXT:    movq %r9, %rdi
+; AVX512DQ-NEXT:    shlq $25, %rdi
+; AVX512DQ-NEXT:    orq %r8, %rdi
+; AVX512DQ-NEXT:    movq %r9, %r8
+; AVX512DQ-NEXT:    shlq $26, %r8
+; AVX512DQ-NEXT:    orq %rdi, %r8
+; AVX512DQ-NEXT:    movq %r9, %rdi
+; AVX512DQ-NEXT:    shlq $27, %rdi
+; AVX512DQ-NEXT:    orq %r8, %rdi
+; AVX512DQ-NEXT:    movq %r9, %r8
+; AVX512DQ-NEXT:    shlq $28, %r8
+; AVX512DQ-NEXT:    orq %rdi, %r8
+; AVX512DQ-NEXT:    shlq $29, %r9
+; AVX512DQ-NEXT:    orq %r8, %r9
+; AVX512DQ-NEXT:    movzbl %cl, %edi
+; AVX512DQ-NEXT:    andl $1, %edi
+; AVX512DQ-NEXT:    movq %rdi, %r8
+; AVX512DQ-NEXT:    shlq $30, %r8
+; AVX512DQ-NEXT:    orq %r9, %r8
+; AVX512DQ-NEXT:    shlq $31, %rdi
+; AVX512DQ-NEXT:    orq %r8, %rdi
+; AVX512DQ-NEXT:    orq %rdx, %rdi
+; AVX512DQ-NEXT:    movl %edi, (%rsi)
+; AVX512DQ-NEXT:    andl $1, %ecx
+; AVX512DQ-NEXT:    movq %rcx, %rdx
+; AVX512DQ-NEXT:    shlq $32, %rdx
+; AVX512DQ-NEXT:    movq %rcx, %r8
+; AVX512DQ-NEXT:    shlq $33, %r8
+; AVX512DQ-NEXT:    orq %rdx, %r8
+; AVX512DQ-NEXT:    shlq $34, %rcx
+; AVX512DQ-NEXT:    orq %r8, %rcx
+; AVX512DQ-NEXT:    movzbl %al, %edx
+; AVX512DQ-NEXT:    # kill: def $eax killed $eax def $rax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %r8
+; AVX512DQ-NEXT:    shlq $35, %r8
+; AVX512DQ-NEXT:    orq %rcx, %r8
+; AVX512DQ-NEXT:    movq %rax, %rcx
+; AVX512DQ-NEXT:    shlq $36, %rcx
+; AVX512DQ-NEXT:    orq %r8, %rcx
+; AVX512DQ-NEXT:    movq %rax, %r8
+; AVX512DQ-NEXT:    shlq $37, %r8
+; AVX512DQ-NEXT:    orq %rcx, %r8
+; AVX512DQ-NEXT:    shlq $38, %rax
+; AVX512DQ-NEXT:    orq %r8, %rax
+; AVX512DQ-NEXT:    shlq $39, %rdx
+; AVX512DQ-NEXT:    orq %rax, %rdx
+; AVX512DQ-NEXT:    orq %rdi, %rdx
+; AVX512DQ-NEXT:    shrq $32, %rdx
+; AVX512DQ-NEXT:    movb %dl, 4(%rsi)
+; AVX512DQ-NEXT:    popq %rbx
+; AVX512DQ-NEXT:    popq %r14
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: mask_replication_factor5_vf8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    pushq %r14
+; AVX512BW-NEXT:    pushq %rbx
+; AVX512BW-NEXT:    kmovw (%rdi), %k0
+; AVX512BW-NEXT:    kshiftrw $7, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    kshiftrw $6, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %ecx
+; AVX512BW-NEXT:    kshiftrw $5, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %edi
+; AVX512BW-NEXT:    kshiftrw $4, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r8d
+; AVX512BW-NEXT:    kshiftrw $3, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r9d
+; AVX512BW-NEXT:    kshiftrw $2, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r10d
+; AVX512BW-NEXT:    kshiftrw $1, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %edx
+; AVX512BW-NEXT:    kmovd %k0, %r11d
+; AVX512BW-NEXT:    movzbl %r11b, %ebx
+; AVX512BW-NEXT:    andl $1, %ebx
+; AVX512BW-NEXT:    leaq (%rbx,%rbx,2), %r11
+; AVX512BW-NEXT:    leaq (%r11,%rbx,4), %r11
+; AVX512BW-NEXT:    leaq (%r11,%rbx,8), %r11
+; AVX512BW-NEXT:    shlq $4, %rbx
+; AVX512BW-NEXT:    orq %r11, %rbx
+; AVX512BW-NEXT:    movzbl %dl, %r11d
+; AVX512BW-NEXT:    andl $1, %r11d
+; AVX512BW-NEXT:    movq %r11, %rdx
+; AVX512BW-NEXT:    shlq $5, %rdx
+; AVX512BW-NEXT:    orq %rbx, %rdx
+; AVX512BW-NEXT:    movq %r11, %rbx
+; AVX512BW-NEXT:    shlq $6, %rbx
+; AVX512BW-NEXT:    movq %r11, %r14
+; AVX512BW-NEXT:    shlq $7, %r14
+; AVX512BW-NEXT:    orq %rbx, %r14
+; AVX512BW-NEXT:    movq %r11, %rbx
+; AVX512BW-NEXT:    shlq $8, %rbx
+; AVX512BW-NEXT:    orq %r14, %rbx
+; AVX512BW-NEXT:    shlq $9, %r11
+; AVX512BW-NEXT:    orq %rbx, %r11
+; AVX512BW-NEXT:    movzbl %r10b, %r10d
+; AVX512BW-NEXT:    andl $1, %r10d
+; AVX512BW-NEXT:    movq %r10, %rbx
+; AVX512BW-NEXT:    shlq $10, %rbx
+; AVX512BW-NEXT:    orq %r11, %rbx
+; AVX512BW-NEXT:    movq %r10, %r11
+; AVX512BW-NEXT:    shlq $11, %r11
+; AVX512BW-NEXT:    orq %rbx, %r11
+; AVX512BW-NEXT:    movq %r10, %rbx
+; AVX512BW-NEXT:    shlq $12, %rbx
+; AVX512BW-NEXT:    orq %r11, %rbx
+; AVX512BW-NEXT:    movq %r10, %r11
+; AVX512BW-NEXT:    shlq $13, %r11
+; AVX512BW-NEXT:    orq %rbx, %r11
+; AVX512BW-NEXT:    shlq $14, %r10
+; AVX512BW-NEXT:    orq %r11, %r10
+; AVX512BW-NEXT:    movzbl %r9b, %r9d
+; AVX512BW-NEXT:    andl $1, %r9d
+; AVX512BW-NEXT:    movq %r9, %r11
+; AVX512BW-NEXT:    shlq $15, %r11
+; AVX512BW-NEXT:    orq %r10, %r11
+; AVX512BW-NEXT:    movq %r9, %r10
+; AVX512BW-NEXT:    shlq $16, %r10
+; AVX512BW-NEXT:    orq %r11, %r10
+; AVX512BW-NEXT:    movq %r9, %r11
+; AVX512BW-NEXT:    shlq $17, %r11
+; AVX512BW-NEXT:    orq %r10, %r11
+; AVX512BW-NEXT:    movq %r9, %r10
+; AVX512BW-NEXT:    shlq $18, %r10
+; AVX512BW-NEXT:    orq %r11, %r10
+; AVX512BW-NEXT:    shlq $19, %r9
+; AVX512BW-NEXT:    orq %r10, %r9
+; AVX512BW-NEXT:    movzbl %r8b, %r8d
+; AVX512BW-NEXT:    andl $1, %r8d
+; AVX512BW-NEXT:    movq %r8, %r10
+; AVX512BW-NEXT:    shlq $20, %r10
+; AVX512BW-NEXT:    orq %r9, %r10
+; AVX512BW-NEXT:    movq %r8, %r9
+; AVX512BW-NEXT:    shlq $21, %r9
+; AVX512BW-NEXT:    orq %r10, %r9
+; AVX512BW-NEXT:    movq %r8, %r10
+; AVX512BW-NEXT:    shlq $22, %r10
+; AVX512BW-NEXT:    orq %r9, %r10
+; AVX512BW-NEXT:    movq %r8, %r9
+; AVX512BW-NEXT:    shlq $23, %r9
+; AVX512BW-NEXT:    orq %r10, %r9
+; AVX512BW-NEXT:    shlq $24, %r8
+; AVX512BW-NEXT:    orq %r9, %r8
+; AVX512BW-NEXT:    movzbl %dil, %r9d
+; AVX512BW-NEXT:    andl $1, %r9d
+; AVX512BW-NEXT:    movq %r9, %rdi
+; AVX512BW-NEXT:    shlq $25, %rdi
+; AVX512BW-NEXT:    orq %r8, %rdi
+; AVX512BW-NEXT:    movq %r9, %r8
+; AVX512BW-NEXT:    shlq $26, %r8
+; AVX512BW-NEXT:    orq %rdi, %r8
+; AVX512BW-NEXT:    movq %r9, %rdi
+; AVX512BW-NEXT:    shlq $27, %rdi
+; AVX512BW-NEXT:    orq %r8, %rdi
+; AVX512BW-NEXT:    movq %r9, %r8
+; AVX512BW-NEXT:    shlq $28, %r8
+; AVX512BW-NEXT:    orq %rdi, %r8
+; AVX512BW-NEXT:    shlq $29, %r9
+; AVX512BW-NEXT:    orq %r8, %r9
+; AVX512BW-NEXT:    movzbl %cl, %edi
+; AVX512BW-NEXT:    andl $1, %edi
+; AVX512BW-NEXT:    movq %rdi, %r8
+; AVX512BW-NEXT:    shlq $30, %r8
+; AVX512BW-NEXT:    orq %r9, %r8
+; AVX512BW-NEXT:    shlq $31, %rdi
+; AVX512BW-NEXT:    orq %r8, %rdi
+; AVX512BW-NEXT:    orq %rdx, %rdi
+; AVX512BW-NEXT:    movl %edi, (%rsi)
+; AVX512BW-NEXT:    andl $1, %ecx
+; AVX512BW-NEXT:    movq %rcx, %rdx
+; AVX512BW-NEXT:    shlq $32, %rdx
+; AVX512BW-NEXT:    movq %rcx, %r8
+; AVX512BW-NEXT:    shlq $33, %r8
+; AVX512BW-NEXT:    orq %rdx, %r8
+; AVX512BW-NEXT:    shlq $34, %rcx
+; AVX512BW-NEXT:    orq %r8, %rcx
+; AVX512BW-NEXT:    movzbl %al, %edx
+; AVX512BW-NEXT:    # kill: def $eax killed $eax def $rax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %r8
+; AVX512BW-NEXT:    shlq $35, %r8
+; AVX512BW-NEXT:    orq %rcx, %r8
+; AVX512BW-NEXT:    movq %rax, %rcx
+; AVX512BW-NEXT:    shlq $36, %rcx
+; AVX512BW-NEXT:    orq %r8, %rcx
+; AVX512BW-NEXT:    movq %rax, %r8
+; AVX512BW-NEXT:    shlq $37, %r8
+; AVX512BW-NEXT:    orq %rcx, %r8
+; AVX512BW-NEXT:    shlq $38, %rax
+; AVX512BW-NEXT:    orq %r8, %rax
+; AVX512BW-NEXT:    shlq $39, %rdx
+; AVX512BW-NEXT:    orq %rax, %rdx
+; AVX512BW-NEXT:    orq %rdi, %rdx
+; AVX512BW-NEXT:    shrq $32, %rdx
+; AVX512BW-NEXT:    movb %dl, 4(%rsi)
+; AVX512BW-NEXT:    popq %rbx
+; AVX512BW-NEXT:    popq %r14
+; AVX512BW-NEXT:    retq
+  %src.vec = load <8 x i1>, ptr %in.vec, align 64
+  %tgt.mask = shufflevector <8 x i1> %src.vec, <8 x i1> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
+  store <40 x i1> %tgt.mask, ptr %out.vec, align 64
+  ret void
+}
+
+; Replication factor 5 with 16 source lanes: loads a <16 x i1> mask from
+; %in.vec, repeats every source bit five times in order (shuffle indices
+; 0,0,0,0,0,1,1,1,1,1,...,15), and stores the <80 x i1> result to %out.vec.
+;
+; The assertions below are autogenerated (see the NOTE at the top of this
+; file); regenerate them with utils/update_llc_test_checks.py rather than
+; editing them by hand. In the expected code every source bit is moved into
+; a GPR via kshiftrw + kmov, the 80 result bits are assembled with shift/or
+; chains, and the result is written as a 64-bit store to (%rsi) plus a
+; 16-bit store to 8(%rsi). The +avx512bw expectations appear to differ from
+; the other two groups only in using kmovd instead of kmovw for the
+; mask-to-GPR moves.
+define void @mask_replication_factor5_vf16(ptr %in.vec, ptr %out.vec) nounwind {
+; AVX512F-ONLY-LABEL: mask_replication_factor5_vf16:
+; AVX512F-ONLY:       # %bb.0:
+; AVX512F-ONLY-NEXT:    pushq %rbp
+; AVX512F-ONLY-NEXT:    pushq %r15
+; AVX512F-ONLY-NEXT:    pushq %r14
+; AVX512F-ONLY-NEXT:    pushq %r13
+; AVX512F-ONLY-NEXT:    pushq %r12
+; AVX512F-ONLY-NEXT:    pushq %rbx
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k0
+; AVX512F-ONLY-NEXT:    kshiftrw $12, %k0, %k1
+; AVX512F-ONLY-NEXT:    kmovw %k1, %edx
+; AVX512F-ONLY-NEXT:    kshiftrw $6, %k0, %k1
+; AVX512F-ONLY-NEXT:    kmovw %k1, %ecx
+; AVX512F-ONLY-NEXT:    kshiftrw $5, %k0, %k1
+; AVX512F-ONLY-NEXT:    kmovw %k1, %eax
+; AVX512F-ONLY-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512F-ONLY-NEXT:    kshiftrw $4, %k0, %k1
+; AVX512F-ONLY-NEXT:    kmovw %k1, %edi
+; AVX512F-ONLY-NEXT:    kshiftrw $3, %k0, %k1
+; AVX512F-ONLY-NEXT:    kmovw %k1, %r8d
+; AVX512F-ONLY-NEXT:    kshiftrw $2, %k0, %k1
+; AVX512F-ONLY-NEXT:    kmovw %k1, %r9d
+; AVX512F-ONLY-NEXT:    kshiftrw $1, %k0, %k1
+; AVX512F-ONLY-NEXT:    kmovw %k1, %r10d
+; AVX512F-ONLY-NEXT:    kshiftrw $13, %k0, %k1
+; AVX512F-ONLY-NEXT:    kmovw %k1, %r11d
+; AVX512F-ONLY-NEXT:    kshiftrw $14, %k0, %k1
+; AVX512F-ONLY-NEXT:    kmovw %k1, %ebp
+; AVX512F-ONLY-NEXT:    kshiftrw $15, %k0, %k1
+; AVX512F-ONLY-NEXT:    kmovw %k1, %eax
+; AVX512F-ONLY-NEXT:    kshiftrw $7, %k0, %k1
+; AVX512F-ONLY-NEXT:    kmovw %k1, %r15d
+; AVX512F-ONLY-NEXT:    kshiftrw $8, %k0, %k1
+; AVX512F-ONLY-NEXT:    andl $1, %r11d
+; AVX512F-ONLY-NEXT:    movl %edx, %r14d
+; AVX512F-ONLY-NEXT:    andl $1, %r14d
+; AVX512F-ONLY-NEXT:    leal (%r14,%r11,2), %ebx
+; AVX512F-ONLY-NEXT:    leal (%rbx,%r11,4), %ebx
+; AVX512F-ONLY-NEXT:    leal (%rbx,%r11,8), %ebx
+; AVX512F-ONLY-NEXT:    movl %r11d, %r12d
+; AVX512F-ONLY-NEXT:    shll $4, %r12d
+; AVX512F-ONLY-NEXT:    orl %ebx, %r12d
+; AVX512F-ONLY-NEXT:    shll $5, %r11d
+; AVX512F-ONLY-NEXT:    orl %r12d, %r11d
+; AVX512F-ONLY-NEXT:    andl $1, %ebp
+; AVX512F-ONLY-NEXT:    movl %ebp, %ebx
+; AVX512F-ONLY-NEXT:    shll $6, %ebx
+; AVX512F-ONLY-NEXT:    movl %ebp, %r12d
+; AVX512F-ONLY-NEXT:    shll $7, %r12d
+; AVX512F-ONLY-NEXT:    orl %ebx, %r12d
+; AVX512F-ONLY-NEXT:    movl %ebp, %ebx
+; AVX512F-ONLY-NEXT:    shll $8, %ebx
+; AVX512F-ONLY-NEXT:    orl %r12d, %ebx
+; AVX512F-ONLY-NEXT:    movl %ebp, %r12d
+; AVX512F-ONLY-NEXT:    shll $9, %r12d
+; AVX512F-ONLY-NEXT:    orl %ebx, %r12d
+; AVX512F-ONLY-NEXT:    shll $10, %ebp
+; AVX512F-ONLY-NEXT:    orl %r12d, %ebp
+; AVX512F-ONLY-NEXT:    movl %eax, %ebx
+; AVX512F-ONLY-NEXT:    andl $1, %ebx
+; AVX512F-ONLY-NEXT:    movl %ebx, %r12d
+; AVX512F-ONLY-NEXT:    shll $11, %r12d
+; AVX512F-ONLY-NEXT:    orl %ebp, %r12d
+; AVX512F-ONLY-NEXT:    movl %ebx, %ebp
+; AVX512F-ONLY-NEXT:    shll $12, %ebp
+; AVX512F-ONLY-NEXT:    orl %r12d, %ebp
+; AVX512F-ONLY-NEXT:    movl %ebx, %r13d
+; AVX512F-ONLY-NEXT:    shll $13, %r13d
+; AVX512F-ONLY-NEXT:    orl %ebp, %r13d
+; AVX512F-ONLY-NEXT:    kmovw %k1, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $9, %k0, %k1
+; AVX512F-ONLY-NEXT:    shll $14, %ebx
+; AVX512F-ONLY-NEXT:    orl %r13d, %ebx
+; AVX512F-ONLY-NEXT:    kmovw %k1, %r13d
+; AVX512F-ONLY-NEXT:    kshiftrw $10, %k0, %k1
+; AVX512F-ONLY-NEXT:    shll $15, %eax
+; AVX512F-ONLY-NEXT:    orl %ebx, %eax
+; AVX512F-ONLY-NEXT:    orl %r11d, %eax
+; AVX512F-ONLY-NEXT:    movw %ax, 8(%rsi)
+; AVX512F-ONLY-NEXT:    kmovw %k0, %eax
+; AVX512F-ONLY-NEXT:    movzbl %al, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    leaq (%rax,%rax,2), %r11
+; AVX512F-ONLY-NEXT:    leaq (%r11,%rax,4), %r11
+; AVX512F-ONLY-NEXT:    leaq (%r11,%rax,8), %r11
+; AVX512F-ONLY-NEXT:    shlq $4, %rax
+; AVX512F-ONLY-NEXT:    orq %r11, %rax
+; AVX512F-ONLY-NEXT:    movzbl %r10b, %ebp
+; AVX512F-ONLY-NEXT:    andl $1, %ebp
+; AVX512F-ONLY-NEXT:    movq %rbp, %r10
+; AVX512F-ONLY-NEXT:    shlq $5, %r10
+; AVX512F-ONLY-NEXT:    orq %rax, %r10
+; AVX512F-ONLY-NEXT:    movq %rbp, %rax
+; AVX512F-ONLY-NEXT:    shlq $6, %rax
+; AVX512F-ONLY-NEXT:    movq %rbp, %r11
+; AVX512F-ONLY-NEXT:    shlq $7, %r11
+; AVX512F-ONLY-NEXT:    orq %rax, %r11
+; AVX512F-ONLY-NEXT:    movq %rbp, %rax
+; AVX512F-ONLY-NEXT:    shlq $8, %rax
+; AVX512F-ONLY-NEXT:    orq %r11, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k1, %r11d
+; AVX512F-ONLY-NEXT:    kshiftrw $11, %k0, %k0
+; AVX512F-ONLY-NEXT:    shlq $9, %rbp
+; AVX512F-ONLY-NEXT:    orq %rax, %rbp
+; AVX512F-ONLY-NEXT:    movzbl %r9b, %ebx
+; AVX512F-ONLY-NEXT:    andl $1, %ebx
+; AVX512F-ONLY-NEXT:    movq %rbx, %rax
+; AVX512F-ONLY-NEXT:    shlq $10, %rax
+; AVX512F-ONLY-NEXT:    orq %rbp, %rax
+; AVX512F-ONLY-NEXT:    movq %rbx, %r9
+; AVX512F-ONLY-NEXT:    shlq $11, %r9
+; AVX512F-ONLY-NEXT:    orq %rax, %r9
+; AVX512F-ONLY-NEXT:    movq %rbx, %rax
+; AVX512F-ONLY-NEXT:    shlq $12, %rax
+; AVX512F-ONLY-NEXT:    orq %r9, %rax
+; AVX512F-ONLY-NEXT:    movq %rbx, %rbp
+; AVX512F-ONLY-NEXT:    shlq $13, %rbp
+; AVX512F-ONLY-NEXT:    orq %rax, %rbp
+; AVX512F-ONLY-NEXT:    kmovw %k0, %r9d
+; AVX512F-ONLY-NEXT:    movzbl %dl, %edx
+; AVX512F-ONLY-NEXT:    shlq $14, %rbx
+; AVX512F-ONLY-NEXT:    orq %rbp, %rbx
+; AVX512F-ONLY-NEXT:    movzbl %r8b, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %r8
+; AVX512F-ONLY-NEXT:    shlq $15, %r8
+; AVX512F-ONLY-NEXT:    orq %rbx, %r8
+; AVX512F-ONLY-NEXT:    movq %rax, %rbx
+; AVX512F-ONLY-NEXT:    shlq $16, %rbx
+; AVX512F-ONLY-NEXT:    orq %r8, %rbx
+; AVX512F-ONLY-NEXT:    movq %rax, %r8
+; AVX512F-ONLY-NEXT:    shlq $17, %r8
+; AVX512F-ONLY-NEXT:    orq %rbx, %r8
+; AVX512F-ONLY-NEXT:    movq %rax, %rbx
+; AVX512F-ONLY-NEXT:    shlq $18, %rbx
+; AVX512F-ONLY-NEXT:    orq %r8, %rbx
+; AVX512F-ONLY-NEXT:    shlq $19, %rax
+; AVX512F-ONLY-NEXT:    orq %rbx, %rax
+; AVX512F-ONLY-NEXT:    movzbl %dil, %edi
+; AVX512F-ONLY-NEXT:    andl $1, %edi
+; AVX512F-ONLY-NEXT:    movq %rdi, %r8
+; AVX512F-ONLY-NEXT:    shlq $20, %r8
+; AVX512F-ONLY-NEXT:    orq %rax, %r8
+; AVX512F-ONLY-NEXT:    movq %rdi, %rax
+; AVX512F-ONLY-NEXT:    shlq $21, %rax
+; AVX512F-ONLY-NEXT:    orq %r8, %rax
+; AVX512F-ONLY-NEXT:    movq %rdi, %r8
+; AVX512F-ONLY-NEXT:    shlq $22, %r8
+; AVX512F-ONLY-NEXT:    orq %rax, %r8
+; AVX512F-ONLY-NEXT:    movq %rdi, %rax
+; AVX512F-ONLY-NEXT:    shlq $23, %rax
+; AVX512F-ONLY-NEXT:    orq %r8, %rax
+; AVX512F-ONLY-NEXT:    shlq $24, %rdi
+; AVX512F-ONLY-NEXT:    orq %rax, %rdi
+; AVX512F-ONLY-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %r8
+; AVX512F-ONLY-NEXT:    shlq $25, %r8
+; AVX512F-ONLY-NEXT:    orq %rdi, %r8
+; AVX512F-ONLY-NEXT:    movq %rax, %rdi
+; AVX512F-ONLY-NEXT:    shlq $26, %rdi
+; AVX512F-ONLY-NEXT:    orq %r8, %rdi
+; AVX512F-ONLY-NEXT:    movq %rax, %r8
+; AVX512F-ONLY-NEXT:    shlq $27, %r8
+; AVX512F-ONLY-NEXT:    orq %rdi, %r8
+; AVX512F-ONLY-NEXT:    movq %rax, %rdi
+; AVX512F-ONLY-NEXT:    shlq $28, %rdi
+; AVX512F-ONLY-NEXT:    orq %r8, %rdi
+; AVX512F-ONLY-NEXT:    shlq $29, %rax
+; AVX512F-ONLY-NEXT:    orq %rdi, %rax
+; AVX512F-ONLY-NEXT:    movzbl %cl, %edi
+; AVX512F-ONLY-NEXT:    andl $1, %edi
+; AVX512F-ONLY-NEXT:    movq %rdi, %r8
+; AVX512F-ONLY-NEXT:    shlq $30, %r8
+; AVX512F-ONLY-NEXT:    orq %rax, %r8
+; AVX512F-ONLY-NEXT:    shlq $31, %rdi
+; AVX512F-ONLY-NEXT:    orq %r8, %rdi
+; AVX512F-ONLY-NEXT:    andl $1, %ecx
+; AVX512F-ONLY-NEXT:    movq %rcx, %rax
+; AVX512F-ONLY-NEXT:    shlq $32, %rax
+; AVX512F-ONLY-NEXT:    orq %rdi, %rax
+; AVX512F-ONLY-NEXT:    movq %rcx, %rdi
+; AVX512F-ONLY-NEXT:    shlq $33, %rdi
+; AVX512F-ONLY-NEXT:    orq %rax, %rdi
+; AVX512F-ONLY-NEXT:    shlq $34, %rcx
+; AVX512F-ONLY-NEXT:    orq %rdi, %rcx
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $35, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %rcx
+; AVX512F-ONLY-NEXT:    shlq $36, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $37, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %rcx
+; AVX512F-ONLY-NEXT:    shlq $38, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    shlq $39, %r15
+; AVX512F-ONLY-NEXT:    orq %rcx, %r15
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $40, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %rcx
+; AVX512F-ONLY-NEXT:    shlq $41, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $42, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %rcx
+; AVX512F-ONLY-NEXT:    shlq $43, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    shlq $44, %r12
+; AVX512F-ONLY-NEXT:    orq %rcx, %r12
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $45, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %rcx
+; AVX512F-ONLY-NEXT:    shlq $46, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $47, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %rcx
+; AVX512F-ONLY-NEXT:    shlq $48, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    shlq $49, %r13
+; AVX512F-ONLY-NEXT:    orq %rcx, %r13
+; AVX512F-ONLY-NEXT:    andl $1, %r11d
+; AVX512F-ONLY-NEXT:    movq %r11, %rax
+; AVX512F-ONLY-NEXT:    shlq $50, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r11, %rcx
+; AVX512F-ONLY-NEXT:    shlq $51, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    movq %r11, %rax
+; AVX512F-ONLY-NEXT:    shlq $52, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    movq %r11, %rcx
+; AVX512F-ONLY-NEXT:    shlq $53, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    shlq $54, %r11
+; AVX512F-ONLY-NEXT:    orq %rcx, %r11
+; AVX512F-ONLY-NEXT:    andl $1, %r9d
+; AVX512F-ONLY-NEXT:    movq %r9, %rax
+; AVX512F-ONLY-NEXT:    shlq $55, %rax
+; AVX512F-ONLY-NEXT:    orq %r11, %rax
+; AVX512F-ONLY-NEXT:    movq %r9, %rcx
+; AVX512F-ONLY-NEXT:    shlq $56, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    movq %r9, %rax
+; AVX512F-ONLY-NEXT:    shlq $57, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    movq %r9, %rcx
+; AVX512F-ONLY-NEXT:    shlq $58, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    shlq $59, %r9
+; AVX512F-ONLY-NEXT:    orq %rcx, %r9
+; AVX512F-ONLY-NEXT:    movq %r14, %rax
+; AVX512F-ONLY-NEXT:    shlq $60, %rax
+; AVX512F-ONLY-NEXT:    orq %r9, %rax
+; AVX512F-ONLY-NEXT:    movq %r14, %rcx
+; AVX512F-ONLY-NEXT:    shlq $61, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    shlq $62, %r14
+; AVX512F-ONLY-NEXT:    orq %rcx, %r14
+; AVX512F-ONLY-NEXT:    shlq $63, %rdx
+; AVX512F-ONLY-NEXT:    orq %r14, %rdx
+; AVX512F-ONLY-NEXT:    orq %r10, %rdx
+; AVX512F-ONLY-NEXT:    movq %rdx, (%rsi)
+; AVX512F-ONLY-NEXT:    popq %rbx
+; AVX512F-ONLY-NEXT:    popq %r12
+; AVX512F-ONLY-NEXT:    popq %r13
+; AVX512F-ONLY-NEXT:    popq %r14
+; AVX512F-ONLY-NEXT:    popq %r15
+; AVX512F-ONLY-NEXT:    popq %rbp
+; AVX512F-ONLY-NEXT:    retq
+;
+; AVX512DQ-LABEL: mask_replication_factor5_vf16:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    pushq %rbp
+; AVX512DQ-NEXT:    pushq %r15
+; AVX512DQ-NEXT:    pushq %r14
+; AVX512DQ-NEXT:    pushq %r13
+; AVX512DQ-NEXT:    pushq %r12
+; AVX512DQ-NEXT:    pushq %rbx
+; AVX512DQ-NEXT:    kmovw (%rdi), %k0
+; AVX512DQ-NEXT:    kshiftrw $12, %k0, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %edx
+; AVX512DQ-NEXT:    kshiftrw $6, %k0, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %ecx
+; AVX512DQ-NEXT:    kshiftrw $5, %k0, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %eax
+; AVX512DQ-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512DQ-NEXT:    kshiftrw $4, %k0, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %edi
+; AVX512DQ-NEXT:    kshiftrw $3, %k0, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %r8d
+; AVX512DQ-NEXT:    kshiftrw $2, %k0, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %r9d
+; AVX512DQ-NEXT:    kshiftrw $1, %k0, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %r10d
+; AVX512DQ-NEXT:    kshiftrw $13, %k0, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %r11d
+; AVX512DQ-NEXT:    kshiftrw $14, %k0, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %ebp
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %eax
+; AVX512DQ-NEXT:    kshiftrw $7, %k0, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %r15d
+; AVX512DQ-NEXT:    kshiftrw $8, %k0, %k1
+; AVX512DQ-NEXT:    andl $1, %r11d
+; AVX512DQ-NEXT:    movl %edx, %r14d
+; AVX512DQ-NEXT:    andl $1, %r14d
+; AVX512DQ-NEXT:    leal (%r14,%r11,2), %ebx
+; AVX512DQ-NEXT:    leal (%rbx,%r11,4), %ebx
+; AVX512DQ-NEXT:    leal (%rbx,%r11,8), %ebx
+; AVX512DQ-NEXT:    movl %r11d, %r12d
+; AVX512DQ-NEXT:    shll $4, %r12d
+; AVX512DQ-NEXT:    orl %ebx, %r12d
+; AVX512DQ-NEXT:    shll $5, %r11d
+; AVX512DQ-NEXT:    orl %r12d, %r11d
+; AVX512DQ-NEXT:    andl $1, %ebp
+; AVX512DQ-NEXT:    movl %ebp, %ebx
+; AVX512DQ-NEXT:    shll $6, %ebx
+; AVX512DQ-NEXT:    movl %ebp, %r12d
+; AVX512DQ-NEXT:    shll $7, %r12d
+; AVX512DQ-NEXT:    orl %ebx, %r12d
+; AVX512DQ-NEXT:    movl %ebp, %ebx
+; AVX512DQ-NEXT:    shll $8, %ebx
+; AVX512DQ-NEXT:    orl %r12d, %ebx
+; AVX512DQ-NEXT:    movl %ebp, %r12d
+; AVX512DQ-NEXT:    shll $9, %r12d
+; AVX512DQ-NEXT:    orl %ebx, %r12d
+; AVX512DQ-NEXT:    shll $10, %ebp
+; AVX512DQ-NEXT:    orl %r12d, %ebp
+; AVX512DQ-NEXT:    movl %eax, %ebx
+; AVX512DQ-NEXT:    andl $1, %ebx
+; AVX512DQ-NEXT:    movl %ebx, %r12d
+; AVX512DQ-NEXT:    shll $11, %r12d
+; AVX512DQ-NEXT:    orl %ebp, %r12d
+; AVX512DQ-NEXT:    movl %ebx, %ebp
+; AVX512DQ-NEXT:    shll $12, %ebp
+; AVX512DQ-NEXT:    orl %r12d, %ebp
+; AVX512DQ-NEXT:    movl %ebx, %r13d
+; AVX512DQ-NEXT:    shll $13, %r13d
+; AVX512DQ-NEXT:    orl %ebp, %r13d
+; AVX512DQ-NEXT:    kmovw %k1, %r12d
+; AVX512DQ-NEXT:    kshiftrw $9, %k0, %k1
+; AVX512DQ-NEXT:    shll $14, %ebx
+; AVX512DQ-NEXT:    orl %r13d, %ebx
+; AVX512DQ-NEXT:    kmovw %k1, %r13d
+; AVX512DQ-NEXT:    kshiftrw $10, %k0, %k1
+; AVX512DQ-NEXT:    shll $15, %eax
+; AVX512DQ-NEXT:    orl %ebx, %eax
+; AVX512DQ-NEXT:    orl %r11d, %eax
+; AVX512DQ-NEXT:    movw %ax, 8(%rsi)
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    movzbl %al, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    leaq (%rax,%rax,2), %r11
+; AVX512DQ-NEXT:    leaq (%r11,%rax,4), %r11
+; AVX512DQ-NEXT:    leaq (%r11,%rax,8), %r11
+; AVX512DQ-NEXT:    shlq $4, %rax
+; AVX512DQ-NEXT:    orq %r11, %rax
+; AVX512DQ-NEXT:    movzbl %r10b, %ebp
+; AVX512DQ-NEXT:    andl $1, %ebp
+; AVX512DQ-NEXT:    movq %rbp, %r10
+; AVX512DQ-NEXT:    shlq $5, %r10
+; AVX512DQ-NEXT:    orq %rax, %r10
+; AVX512DQ-NEXT:    movq %rbp, %rax
+; AVX512DQ-NEXT:    shlq $6, %rax
+; AVX512DQ-NEXT:    movq %rbp, %r11
+; AVX512DQ-NEXT:    shlq $7, %r11
+; AVX512DQ-NEXT:    orq %rax, %r11
+; AVX512DQ-NEXT:    movq %rbp, %rax
+; AVX512DQ-NEXT:    shlq $8, %rax
+; AVX512DQ-NEXT:    orq %r11, %rax
+; AVX512DQ-NEXT:    kmovw %k1, %r11d
+; AVX512DQ-NEXT:    kshiftrw $11, %k0, %k0
+; AVX512DQ-NEXT:    shlq $9, %rbp
+; AVX512DQ-NEXT:    orq %rax, %rbp
+; AVX512DQ-NEXT:    movzbl %r9b, %ebx
+; AVX512DQ-NEXT:    andl $1, %ebx
+; AVX512DQ-NEXT:    movq %rbx, %rax
+; AVX512DQ-NEXT:    shlq $10, %rax
+; AVX512DQ-NEXT:    orq %rbp, %rax
+; AVX512DQ-NEXT:    movq %rbx, %r9
+; AVX512DQ-NEXT:    shlq $11, %r9
+; AVX512DQ-NEXT:    orq %rax, %r9
+; AVX512DQ-NEXT:    movq %rbx, %rax
+; AVX512DQ-NEXT:    shlq $12, %rax
+; AVX512DQ-NEXT:    orq %r9, %rax
+; AVX512DQ-NEXT:    movq %rbx, %rbp
+; AVX512DQ-NEXT:    shlq $13, %rbp
+; AVX512DQ-NEXT:    orq %rax, %rbp
+; AVX512DQ-NEXT:    kmovw %k0, %r9d
+; AVX512DQ-NEXT:    movzbl %dl, %edx
+; AVX512DQ-NEXT:    shlq $14, %rbx
+; AVX512DQ-NEXT:    orq %rbp, %rbx
+; AVX512DQ-NEXT:    movzbl %r8b, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %r8
+; AVX512DQ-NEXT:    shlq $15, %r8
+; AVX512DQ-NEXT:    orq %rbx, %r8
+; AVX512DQ-NEXT:    movq %rax, %rbx
+; AVX512DQ-NEXT:    shlq $16, %rbx
+; AVX512DQ-NEXT:    orq %r8, %rbx
+; AVX512DQ-NEXT:    movq %rax, %r8
+; AVX512DQ-NEXT:    shlq $17, %r8
+; AVX512DQ-NEXT:    orq %rbx, %r8
+; AVX512DQ-NEXT:    movq %rax, %rbx
+; AVX512DQ-NEXT:    shlq $18, %rbx
+; AVX512DQ-NEXT:    orq %r8, %rbx
+; AVX512DQ-NEXT:    shlq $19, %rax
+; AVX512DQ-NEXT:    orq %rbx, %rax
+; AVX512DQ-NEXT:    movzbl %dil, %edi
+; AVX512DQ-NEXT:    andl $1, %edi
+; AVX512DQ-NEXT:    movq %rdi, %r8
+; AVX512DQ-NEXT:    shlq $20, %r8
+; AVX512DQ-NEXT:    orq %rax, %r8
+; AVX512DQ-NEXT:    movq %rdi, %rax
+; AVX512DQ-NEXT:    shlq $21, %rax
+; AVX512DQ-NEXT:    orq %r8, %rax
+; AVX512DQ-NEXT:    movq %rdi, %r8
+; AVX512DQ-NEXT:    shlq $22, %r8
+; AVX512DQ-NEXT:    orq %rax, %r8
+; AVX512DQ-NEXT:    movq %rdi, %rax
+; AVX512DQ-NEXT:    shlq $23, %rax
+; AVX512DQ-NEXT:    orq %r8, %rax
+; AVX512DQ-NEXT:    shlq $24, %rdi
+; AVX512DQ-NEXT:    orq %rax, %rdi
+; AVX512DQ-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %r8
+; AVX512DQ-NEXT:    shlq $25, %r8
+; AVX512DQ-NEXT:    orq %rdi, %r8
+; AVX512DQ-NEXT:    movq %rax, %rdi
+; AVX512DQ-NEXT:    shlq $26, %rdi
+; AVX512DQ-NEXT:    orq %r8, %rdi
+; AVX512DQ-NEXT:    movq %rax, %r8
+; AVX512DQ-NEXT:    shlq $27, %r8
+; AVX512DQ-NEXT:    orq %rdi, %r8
+; AVX512DQ-NEXT:    movq %rax, %rdi
+; AVX512DQ-NEXT:    shlq $28, %rdi
+; AVX512DQ-NEXT:    orq %r8, %rdi
+; AVX512DQ-NEXT:    shlq $29, %rax
+; AVX512DQ-NEXT:    orq %rdi, %rax
+; AVX512DQ-NEXT:    movzbl %cl, %edi
+; AVX512DQ-NEXT:    andl $1, %edi
+; AVX512DQ-NEXT:    movq %rdi, %r8
+; AVX512DQ-NEXT:    shlq $30, %r8
+; AVX512DQ-NEXT:    orq %rax, %r8
+; AVX512DQ-NEXT:    shlq $31, %rdi
+; AVX512DQ-NEXT:    orq %r8, %rdi
+; AVX512DQ-NEXT:    andl $1, %ecx
+; AVX512DQ-NEXT:    movq %rcx, %rax
+; AVX512DQ-NEXT:    shlq $32, %rax
+; AVX512DQ-NEXT:    orq %rdi, %rax
+; AVX512DQ-NEXT:    movq %rcx, %rdi
+; AVX512DQ-NEXT:    shlq $33, %rdi
+; AVX512DQ-NEXT:    orq %rax, %rdi
+; AVX512DQ-NEXT:    shlq $34, %rcx
+; AVX512DQ-NEXT:    orq %rdi, %rcx
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $35, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    movq %r15, %rcx
+; AVX512DQ-NEXT:    shlq $36, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $37, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    movq %r15, %rcx
+; AVX512DQ-NEXT:    shlq $38, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    shlq $39, %r15
+; AVX512DQ-NEXT:    orq %rcx, %r15
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $40, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movq %r12, %rcx
+; AVX512DQ-NEXT:    shlq $41, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $42, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    movq %r12, %rcx
+; AVX512DQ-NEXT:    shlq $43, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    shlq $44, %r12
+; AVX512DQ-NEXT:    orq %rcx, %r12
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $45, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %rcx
+; AVX512DQ-NEXT:    shlq $46, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $47, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    movq %r13, %rcx
+; AVX512DQ-NEXT:    shlq $48, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    shlq $49, %r13
+; AVX512DQ-NEXT:    orq %rcx, %r13
+; AVX512DQ-NEXT:    andl $1, %r11d
+; AVX512DQ-NEXT:    movq %r11, %rax
+; AVX512DQ-NEXT:    shlq $50, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r11, %rcx
+; AVX512DQ-NEXT:    shlq $51, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    movq %r11, %rax
+; AVX512DQ-NEXT:    shlq $52, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    movq %r11, %rcx
+; AVX512DQ-NEXT:    shlq $53, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    shlq $54, %r11
+; AVX512DQ-NEXT:    orq %rcx, %r11
+; AVX512DQ-NEXT:    andl $1, %r9d
+; AVX512DQ-NEXT:    movq %r9, %rax
+; AVX512DQ-NEXT:    shlq $55, %rax
+; AVX512DQ-NEXT:    orq %r11, %rax
+; AVX512DQ-NEXT:    movq %r9, %rcx
+; AVX512DQ-NEXT:    shlq $56, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    movq %r9, %rax
+; AVX512DQ-NEXT:    shlq $57, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    movq %r9, %rcx
+; AVX512DQ-NEXT:    shlq $58, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    shlq $59, %r9
+; AVX512DQ-NEXT:    orq %rcx, %r9
+; AVX512DQ-NEXT:    movq %r14, %rax
+; AVX512DQ-NEXT:    shlq $60, %rax
+; AVX512DQ-NEXT:    orq %r9, %rax
+; AVX512DQ-NEXT:    movq %r14, %rcx
+; AVX512DQ-NEXT:    shlq $61, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    shlq $62, %r14
+; AVX512DQ-NEXT:    orq %rcx, %r14
+; AVX512DQ-NEXT:    shlq $63, %rdx
+; AVX512DQ-NEXT:    orq %r14, %rdx
+; AVX512DQ-NEXT:    orq %r10, %rdx
+; AVX512DQ-NEXT:    movq %rdx, (%rsi)
+; AVX512DQ-NEXT:    popq %rbx
+; AVX512DQ-NEXT:    popq %r12
+; AVX512DQ-NEXT:    popq %r13
+; AVX512DQ-NEXT:    popq %r14
+; AVX512DQ-NEXT:    popq %r15
+; AVX512DQ-NEXT:    popq %rbp
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: mask_replication_factor5_vf16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    pushq %rbp
+; AVX512BW-NEXT:    pushq %r15
+; AVX512BW-NEXT:    pushq %r14
+; AVX512BW-NEXT:    pushq %r13
+; AVX512BW-NEXT:    pushq %r12
+; AVX512BW-NEXT:    pushq %rbx
+; AVX512BW-NEXT:    kmovw (%rdi), %k0
+; AVX512BW-NEXT:    kshiftrw $12, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %edx
+; AVX512BW-NEXT:    kshiftrw $6, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %ecx
+; AVX512BW-NEXT:    kshiftrw $5, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512BW-NEXT:    kshiftrw $4, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %edi
+; AVX512BW-NEXT:    kshiftrw $3, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r8d
+; AVX512BW-NEXT:    kshiftrw $2, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r9d
+; AVX512BW-NEXT:    kshiftrw $1, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r10d
+; AVX512BW-NEXT:    kshiftrw $13, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r11d
+; AVX512BW-NEXT:    kshiftrw $14, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %ebp
+; AVX512BW-NEXT:    kshiftrw $15, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    kshiftrw $7, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r15d
+; AVX512BW-NEXT:    kshiftrw $8, %k0, %k1
+; AVX512BW-NEXT:    andl $1, %r11d
+; AVX512BW-NEXT:    movl %edx, %r14d
+; AVX512BW-NEXT:    andl $1, %r14d
+; AVX512BW-NEXT:    leal (%r14,%r11,2), %ebx
+; AVX512BW-NEXT:    leal (%rbx,%r11,4), %ebx
+; AVX512BW-NEXT:    leal (%rbx,%r11,8), %ebx
+; AVX512BW-NEXT:    movl %r11d, %r12d
+; AVX512BW-NEXT:    shll $4, %r12d
+; AVX512BW-NEXT:    orl %ebx, %r12d
+; AVX512BW-NEXT:    shll $5, %r11d
+; AVX512BW-NEXT:    orl %r12d, %r11d
+; AVX512BW-NEXT:    andl $1, %ebp
+; AVX512BW-NEXT:    movl %ebp, %ebx
+; AVX512BW-NEXT:    shll $6, %ebx
+; AVX512BW-NEXT:    movl %ebp, %r12d
+; AVX512BW-NEXT:    shll $7, %r12d
+; AVX512BW-NEXT:    orl %ebx, %r12d
+; AVX512BW-NEXT:    movl %ebp, %ebx
+; AVX512BW-NEXT:    shll $8, %ebx
+; AVX512BW-NEXT:    orl %r12d, %ebx
+; AVX512BW-NEXT:    movl %ebp, %r12d
+; AVX512BW-NEXT:    shll $9, %r12d
+; AVX512BW-NEXT:    orl %ebx, %r12d
+; AVX512BW-NEXT:    shll $10, %ebp
+; AVX512BW-NEXT:    orl %r12d, %ebp
+; AVX512BW-NEXT:    movl %eax, %ebx
+; AVX512BW-NEXT:    andl $1, %ebx
+; AVX512BW-NEXT:    movl %ebx, %r12d
+; AVX512BW-NEXT:    shll $11, %r12d
+; AVX512BW-NEXT:    orl %ebp, %r12d
+; AVX512BW-NEXT:    movl %ebx, %ebp
+; AVX512BW-NEXT:    shll $12, %ebp
+; AVX512BW-NEXT:    orl %r12d, %ebp
+; AVX512BW-NEXT:    movl %ebx, %r13d
+; AVX512BW-NEXT:    shll $13, %r13d
+; AVX512BW-NEXT:    orl %ebp, %r13d
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrw $9, %k0, %k1
+; AVX512BW-NEXT:    shll $14, %ebx
+; AVX512BW-NEXT:    orl %r13d, %ebx
+; AVX512BW-NEXT:    kmovd %k1, %r13d
+; AVX512BW-NEXT:    kshiftrw $10, %k0, %k1
+; AVX512BW-NEXT:    shll $15, %eax
+; AVX512BW-NEXT:    orl %ebx, %eax
+; AVX512BW-NEXT:    orl %r11d, %eax
+; AVX512BW-NEXT:    movw %ax, 8(%rsi)
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    movzbl %al, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    leaq (%rax,%rax,2), %r11
+; AVX512BW-NEXT:    leaq (%r11,%rax,4), %r11
+; AVX512BW-NEXT:    leaq (%r11,%rax,8), %r11
+; AVX512BW-NEXT:    shlq $4, %rax
+; AVX512BW-NEXT:    orq %r11, %rax
+; AVX512BW-NEXT:    movzbl %r10b, %ebp
+; AVX512BW-NEXT:    andl $1, %ebp
+; AVX512BW-NEXT:    movq %rbp, %r10
+; AVX512BW-NEXT:    shlq $5, %r10
+; AVX512BW-NEXT:    orq %rax, %r10
+; AVX512BW-NEXT:    movq %rbp, %rax
+; AVX512BW-NEXT:    shlq $6, %rax
+; AVX512BW-NEXT:    movq %rbp, %r11
+; AVX512BW-NEXT:    shlq $7, %r11
+; AVX512BW-NEXT:    orq %rax, %r11
+; AVX512BW-NEXT:    movq %rbp, %rax
+; AVX512BW-NEXT:    shlq $8, %rax
+; AVX512BW-NEXT:    orq %r11, %rax
+; AVX512BW-NEXT:    kmovd %k1, %r11d
+; AVX512BW-NEXT:    kshiftrw $11, %k0, %k0
+; AVX512BW-NEXT:    shlq $9, %rbp
+; AVX512BW-NEXT:    orq %rax, %rbp
+; AVX512BW-NEXT:    movzbl %r9b, %ebx
+; AVX512BW-NEXT:    andl $1, %ebx
+; AVX512BW-NEXT:    movq %rbx, %rax
+; AVX512BW-NEXT:    shlq $10, %rax
+; AVX512BW-NEXT:    orq %rbp, %rax
+; AVX512BW-NEXT:    movq %rbx, %r9
+; AVX512BW-NEXT:    shlq $11, %r9
+; AVX512BW-NEXT:    orq %rax, %r9
+; AVX512BW-NEXT:    movq %rbx, %rax
+; AVX512BW-NEXT:    shlq $12, %rax
+; AVX512BW-NEXT:    orq %r9, %rax
+; AVX512BW-NEXT:    movq %rbx, %rbp
+; AVX512BW-NEXT:    shlq $13, %rbp
+; AVX512BW-NEXT:    orq %rax, %rbp
+; AVX512BW-NEXT:    kmovd %k0, %r9d
+; AVX512BW-NEXT:    movzbl %dl, %edx
+; AVX512BW-NEXT:    shlq $14, %rbx
+; AVX512BW-NEXT:    orq %rbp, %rbx
+; AVX512BW-NEXT:    movzbl %r8b, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %r8
+; AVX512BW-NEXT:    shlq $15, %r8
+; AVX512BW-NEXT:    orq %rbx, %r8
+; AVX512BW-NEXT:    movq %rax, %rbx
+; AVX512BW-NEXT:    shlq $16, %rbx
+; AVX512BW-NEXT:    orq %r8, %rbx
+; AVX512BW-NEXT:    movq %rax, %r8
+; AVX512BW-NEXT:    shlq $17, %r8
+; AVX512BW-NEXT:    orq %rbx, %r8
+; AVX512BW-NEXT:    movq %rax, %rbx
+; AVX512BW-NEXT:    shlq $18, %rbx
+; AVX512BW-NEXT:    orq %r8, %rbx
+; AVX512BW-NEXT:    shlq $19, %rax
+; AVX512BW-NEXT:    orq %rbx, %rax
+; AVX512BW-NEXT:    movzbl %dil, %edi
+; AVX512BW-NEXT:    andl $1, %edi
+; AVX512BW-NEXT:    movq %rdi, %r8
+; AVX512BW-NEXT:    shlq $20, %r8
+; AVX512BW-NEXT:    orq %rax, %r8
+; AVX512BW-NEXT:    movq %rdi, %rax
+; AVX512BW-NEXT:    shlq $21, %rax
+; AVX512BW-NEXT:    orq %r8, %rax
+; AVX512BW-NEXT:    movq %rdi, %r8
+; AVX512BW-NEXT:    shlq $22, %r8
+; AVX512BW-NEXT:    orq %rax, %r8
+; AVX512BW-NEXT:    movq %rdi, %rax
+; AVX512BW-NEXT:    shlq $23, %rax
+; AVX512BW-NEXT:    orq %r8, %rax
+; AVX512BW-NEXT:    shlq $24, %rdi
+; AVX512BW-NEXT:    orq %rax, %rdi
+; AVX512BW-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %r8
+; AVX512BW-NEXT:    shlq $25, %r8
+; AVX512BW-NEXT:    orq %rdi, %r8
+; AVX512BW-NEXT:    movq %rax, %rdi
+; AVX512BW-NEXT:    shlq $26, %rdi
+; AVX512BW-NEXT:    orq %r8, %rdi
+; AVX512BW-NEXT:    movq %rax, %r8
+; AVX512BW-NEXT:    shlq $27, %r8
+; AVX512BW-NEXT:    orq %rdi, %r8
+; AVX512BW-NEXT:    movq %rax, %rdi
+; AVX512BW-NEXT:    shlq $28, %rdi
+; AVX512BW-NEXT:    orq %r8, %rdi
+; AVX512BW-NEXT:    shlq $29, %rax
+; AVX512BW-NEXT:    orq %rdi, %rax
+; AVX512BW-NEXT:    movzbl %cl, %edi
+; AVX512BW-NEXT:    andl $1, %edi
+; AVX512BW-NEXT:    movq %rdi, %r8
+; AVX512BW-NEXT:    shlq $30, %r8
+; AVX512BW-NEXT:    orq %rax, %r8
+; AVX512BW-NEXT:    shlq $31, %rdi
+; AVX512BW-NEXT:    orq %r8, %rdi
+; AVX512BW-NEXT:    andl $1, %ecx
+; AVX512BW-NEXT:    movq %rcx, %rax
+; AVX512BW-NEXT:    shlq $32, %rax
+; AVX512BW-NEXT:    orq %rdi, %rax
+; AVX512BW-NEXT:    movq %rcx, %rdi
+; AVX512BW-NEXT:    shlq $33, %rdi
+; AVX512BW-NEXT:    orq %rax, %rdi
+; AVX512BW-NEXT:    shlq $34, %rcx
+; AVX512BW-NEXT:    orq %rdi, %rcx
+; AVX512BW-NEXT:    andl $1, %r15d
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $35, %rax
+; AVX512BW-NEXT:    orq %rcx, %rax
+; AVX512BW-NEXT:    movq %r15, %rcx
+; AVX512BW-NEXT:    shlq $36, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $37, %rax
+; AVX512BW-NEXT:    orq %rcx, %rax
+; AVX512BW-NEXT:    movq %r15, %rcx
+; AVX512BW-NEXT:    shlq $38, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    shlq $39, %r15
+; AVX512BW-NEXT:    orq %rcx, %r15
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $40, %rax
+; AVX512BW-NEXT:    orq %r15, %rax
+; AVX512BW-NEXT:    movq %r12, %rcx
+; AVX512BW-NEXT:    shlq $41, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $42, %rax
+; AVX512BW-NEXT:    orq %rcx, %rax
+; AVX512BW-NEXT:    movq %r12, %rcx
+; AVX512BW-NEXT:    shlq $43, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    shlq $44, %r12
+; AVX512BW-NEXT:    orq %rcx, %r12
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $45, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %rcx
+; AVX512BW-NEXT:    shlq $46, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $47, %rax
+; AVX512BW-NEXT:    orq %rcx, %rax
+; AVX512BW-NEXT:    movq %r13, %rcx
+; AVX512BW-NEXT:    shlq $48, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    shlq $49, %r13
+; AVX512BW-NEXT:    orq %rcx, %r13
+; AVX512BW-NEXT:    andl $1, %r11d
+; AVX512BW-NEXT:    movq %r11, %rax
+; AVX512BW-NEXT:    shlq $50, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r11, %rcx
+; AVX512BW-NEXT:    shlq $51, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    movq %r11, %rax
+; AVX512BW-NEXT:    shlq $52, %rax
+; AVX512BW-NEXT:    orq %rcx, %rax
+; AVX512BW-NEXT:    movq %r11, %rcx
+; AVX512BW-NEXT:    shlq $53, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    shlq $54, %r11
+; AVX512BW-NEXT:    orq %rcx, %r11
+; AVX512BW-NEXT:    andl $1, %r9d
+; AVX512BW-NEXT:    movq %r9, %rax
+; AVX512BW-NEXT:    shlq $55, %rax
+; AVX512BW-NEXT:    orq %r11, %rax
+; AVX512BW-NEXT:    movq %r9, %rcx
+; AVX512BW-NEXT:    shlq $56, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    movq %r9, %rax
+; AVX512BW-NEXT:    shlq $57, %rax
+; AVX512BW-NEXT:    orq %rcx, %rax
+; AVX512BW-NEXT:    movq %r9, %rcx
+; AVX512BW-NEXT:    shlq $58, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    shlq $59, %r9
+; AVX512BW-NEXT:    orq %rcx, %r9
+; AVX512BW-NEXT:    movq %r14, %rax
+; AVX512BW-NEXT:    shlq $60, %rax
+; AVX512BW-NEXT:    orq %r9, %rax
+; AVX512BW-NEXT:    movq %r14, %rcx
+; AVX512BW-NEXT:    shlq $61, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    shlq $62, %r14
+; AVX512BW-NEXT:    orq %rcx, %r14
+; AVX512BW-NEXT:    shlq $63, %rdx
+; AVX512BW-NEXT:    orq %r14, %rdx
+; AVX512BW-NEXT:    orq %r10, %rdx
+; AVX512BW-NEXT:    movq %rdx, (%rsi)
+; AVX512BW-NEXT:    popq %rbx
+; AVX512BW-NEXT:    popq %r12
+; AVX512BW-NEXT:    popq %r13
+; AVX512BW-NEXT:    popq %r14
+; AVX512BW-NEXT:    popq %r15
+; AVX512BW-NEXT:    popq %rbp
+; AVX512BW-NEXT:    retq
+  ; Read the 16 source mask bits.
+  %src.vec = load <16 x i1>, ptr %in.vec, align 64
+  ; Widen to 80 bits: result element i takes source element i / 5.
+  %tgt.mask = shufflevector <16 x i1> %src.vec, <16 x i1> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
+  ; Write the replicated 80-bit mask to the output pointer.
+  store <80 x i1> %tgt.mask, ptr %out.vec, align 64
+  ret void
+}
+
+define void @mask_replication_factor5_vf32(ptr %in.vec, ptr %out.vec) nounwind {
+; AVX512F-ONLY-LABEL: mask_replication_factor5_vf32:
+; AVX512F-ONLY:       # %bb.0:
+; AVX512F-ONLY-NEXT:    pushq %rbp
+; AVX512F-ONLY-NEXT:    pushq %r15
+; AVX512F-ONLY-NEXT:    pushq %r14
+; AVX512F-ONLY-NEXT:    pushq %r13
+; AVX512F-ONLY-NEXT:    pushq %r12
+; AVX512F-ONLY-NEXT:    pushq %rbx
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
+; AVX512F-ONLY-NEXT:    kmovw 2(%rdi), %k0
+; AVX512F-ONLY-NEXT:    kshiftrw $9, %k0, %k2
+; AVX512F-ONLY-NEXT:    kmovw %k2, %ecx
+; AVX512F-ONLY-NEXT:    kshiftrw $7, %k0, %k2
+; AVX512F-ONLY-NEXT:    kmovw %k2, %eax
+; AVX512F-ONLY-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512F-ONLY-NEXT:    kshiftrw $3, %k0, %k2
+; AVX512F-ONLY-NEXT:    kmovw %k2, %edx
+; AVX512F-ONLY-NEXT:    kshiftrw $2, %k0, %k2
+; AVX512F-ONLY-NEXT:    kmovw %k2, %edi
+; AVX512F-ONLY-NEXT:    kshiftrw $1, %k0, %k2
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r8d
+; AVX512F-ONLY-NEXT:    kshiftrw $15, %k1, %k2
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r10d
+; AVX512F-ONLY-NEXT:    kshiftrw $14, %k1, %k2
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r11d
+; AVX512F-ONLY-NEXT:    kshiftrw $13, %k1, %k2
+; AVX512F-ONLY-NEXT:    kmovw %k2, %ebx
+; AVX512F-ONLY-NEXT:    kshiftrw $10, %k0, %k2
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r14d
+; AVX512F-ONLY-NEXT:    kshiftrw $11, %k0, %k2
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r15d
+; AVX512F-ONLY-NEXT:    kshiftrw $12, %k0, %k2
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $13, %k0, %k2
+; AVX512F-ONLY-NEXT:    andl $1, %r14d
+; AVX512F-ONLY-NEXT:    movl %ecx, %r9d
+; AVX512F-ONLY-NEXT:    andl $1, %r9d
+; AVX512F-ONLY-NEXT:    leal (%r9,%r9,2), %r13d
+; AVX512F-ONLY-NEXT:    leal (%r13,%r14,4), %r13d
+; AVX512F-ONLY-NEXT:    leal (%r13,%r14,8), %ebp
+; AVX512F-ONLY-NEXT:    movl %r14d, %r13d
+; AVX512F-ONLY-NEXT:    shll $4, %r13d
+; AVX512F-ONLY-NEXT:    orl %ebp, %r13d
+; AVX512F-ONLY-NEXT:    movl %r14d, %ebp
+; AVX512F-ONLY-NEXT:    shll $5, %ebp
+; AVX512F-ONLY-NEXT:    orl %r13d, %ebp
+; AVX512F-ONLY-NEXT:    shll $6, %r14d
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    movl %r15d, %r13d
+; AVX512F-ONLY-NEXT:    shll $7, %r13d
+; AVX512F-ONLY-NEXT:    orl %r14d, %r13d
+; AVX512F-ONLY-NEXT:    movl %r15d, %r14d
+; AVX512F-ONLY-NEXT:    shll $8, %r14d
+; AVX512F-ONLY-NEXT:    orl %r13d, %r14d
+; AVX512F-ONLY-NEXT:    movl %r15d, %r13d
+; AVX512F-ONLY-NEXT:    shll $9, %r13d
+; AVX512F-ONLY-NEXT:    orl %r14d, %r13d
+; AVX512F-ONLY-NEXT:    movl %r15d, %eax
+; AVX512F-ONLY-NEXT:    shll $10, %eax
+; AVX512F-ONLY-NEXT:    orl %r13d, %eax
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r14d
+; AVX512F-ONLY-NEXT:    kshiftrw $14, %k0, %k2
+; AVX512F-ONLY-NEXT:    shll $11, %r15d
+; AVX512F-ONLY-NEXT:    orl %eax, %r15d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movl %r12d, %eax
+; AVX512F-ONLY-NEXT:    shll $12, %eax
+; AVX512F-ONLY-NEXT:    orl %r15d, %eax
+; AVX512F-ONLY-NEXT:    movl %r12d, %r15d
+; AVX512F-ONLY-NEXT:    shll $13, %r15d
+; AVX512F-ONLY-NEXT:    orl %eax, %r15d
+; AVX512F-ONLY-NEXT:    movl %r12d, %eax
+; AVX512F-ONLY-NEXT:    shll $14, %eax
+; AVX512F-ONLY-NEXT:    orl %r15d, %eax
+; AVX512F-ONLY-NEXT:    movl %r12d, %r13d
+; AVX512F-ONLY-NEXT:    shll $15, %r13d
+; AVX512F-ONLY-NEXT:    orl %eax, %r13d
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r15d
+; AVX512F-ONLY-NEXT:    kshiftrw $15, %k0, %k2
+; AVX512F-ONLY-NEXT:    shll $16, %r12d
+; AVX512F-ONLY-NEXT:    orl %r13d, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r14d
+; AVX512F-ONLY-NEXT:    movl %r14d, %eax
+; AVX512F-ONLY-NEXT:    shll $17, %eax
+; AVX512F-ONLY-NEXT:    orl %r12d, %eax
+; AVX512F-ONLY-NEXT:    movl %r14d, %r12d
+; AVX512F-ONLY-NEXT:    shll $18, %r12d
+; AVX512F-ONLY-NEXT:    orl %eax, %r12d
+; AVX512F-ONLY-NEXT:    movl %r14d, %eax
+; AVX512F-ONLY-NEXT:    shll $19, %eax
+; AVX512F-ONLY-NEXT:    orl %r12d, %eax
+; AVX512F-ONLY-NEXT:    movl %r14d, %r12d
+; AVX512F-ONLY-NEXT:    shll $20, %r12d
+; AVX512F-ONLY-NEXT:    orl %eax, %r12d
+; AVX512F-ONLY-NEXT:    kmovw %k2, %eax
+; AVX512F-ONLY-NEXT:    kshiftrw $6, %k1, %k2
+; AVX512F-ONLY-NEXT:    shll $21, %r14d
+; AVX512F-ONLY-NEXT:    orl %r12d, %r14d
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    movl %r15d, %r12d
+; AVX512F-ONLY-NEXT:    shll $22, %r12d
+; AVX512F-ONLY-NEXT:    orl %r14d, %r12d
+; AVX512F-ONLY-NEXT:    movl %r15d, %r14d
+; AVX512F-ONLY-NEXT:    shll $23, %r14d
+; AVX512F-ONLY-NEXT:    orl %r12d, %r14d
+; AVX512F-ONLY-NEXT:    movl %r15d, %r12d
+; AVX512F-ONLY-NEXT:    shll $24, %r12d
+; AVX512F-ONLY-NEXT:    orl %r14d, %r12d
+; AVX512F-ONLY-NEXT:    movl %r15d, %r14d
+; AVX512F-ONLY-NEXT:    shll $25, %r14d
+; AVX512F-ONLY-NEXT:    orl %r12d, %r14d
+; AVX512F-ONLY-NEXT:    shll $26, %r15d
+; AVX512F-ONLY-NEXT:    orl %r14d, %r15d
+; AVX512F-ONLY-NEXT:    movl %eax, %r14d
+; AVX512F-ONLY-NEXT:    andl $1, %r14d
+; AVX512F-ONLY-NEXT:    movl %r14d, %r12d
+; AVX512F-ONLY-NEXT:    shll $27, %r12d
+; AVX512F-ONLY-NEXT:    orl %r15d, %r12d
+; AVX512F-ONLY-NEXT:    movl %r14d, %r15d
+; AVX512F-ONLY-NEXT:    shll $28, %r15d
+; AVX512F-ONLY-NEXT:    orl %r12d, %r15d
+; AVX512F-ONLY-NEXT:    movl %r14d, %r12d
+; AVX512F-ONLY-NEXT:    shll $29, %r12d
+; AVX512F-ONLY-NEXT:    orl %r15d, %r12d
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r15d
+; AVX512F-ONLY-NEXT:    kshiftrw $1, %k1, %k2
+; AVX512F-ONLY-NEXT:    shll $30, %r14d
+; AVX512F-ONLY-NEXT:    orl %r12d, %r14d
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $2, %k1, %k2
+; AVX512F-ONLY-NEXT:    shll $31, %eax
+; AVX512F-ONLY-NEXT:    orl %r14d, %eax
+; AVX512F-ONLY-NEXT:    orl %ebp, %eax
+; AVX512F-ONLY-NEXT:    movl %eax, 16(%rsi)
+; AVX512F-ONLY-NEXT:    kmovw %k1, %eax
+; AVX512F-ONLY-NEXT:    movzbl %al, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    leaq (%rax,%rax,2), %r14
+; AVX512F-ONLY-NEXT:    leaq (%r14,%rax,4), %r14
+; AVX512F-ONLY-NEXT:    leaq (%r14,%rax,8), %r14
+; AVX512F-ONLY-NEXT:    shlq $4, %rax
+; AVX512F-ONLY-NEXT:    orq %r14, %rax
+; AVX512F-ONLY-NEXT:    movzbl %r12b, %r13d
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %r14
+; AVX512F-ONLY-NEXT:    shlq $5, %r14
+; AVX512F-ONLY-NEXT:    orq %rax, %r14
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $6, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $7, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $8, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k2, %ebp
+; AVX512F-ONLY-NEXT:    kshiftrw $3, %k1, %k2
+; AVX512F-ONLY-NEXT:    shlq $9, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movzbl %bpl, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $10, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $11, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $12, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $13, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k2, %eax
+; AVX512F-ONLY-NEXT:    kshiftrw $4, %k1, %k2
+; AVX512F-ONLY-NEXT:    shlq $14, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    movzbl %al, %r13d
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $15, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $16, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $17, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $18, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k2, %eax
+; AVX512F-ONLY-NEXT:    kshiftrw $5, %k1, %k2
+; AVX512F-ONLY-NEXT:    shlq $19, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    movzbl %al, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $20, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $21, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $22, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $23, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k2, %eax
+; AVX512F-ONLY-NEXT:    kshiftrw $12, %k1, %k2
+; AVX512F-ONLY-NEXT:    shlq $24, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    movzbl %al, %r13d
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $25, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $26, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $27, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $28, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $29, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    movzbl %r15b, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $30, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k2, %ebp
+; AVX512F-ONLY-NEXT:    kshiftrw $7, %k1, %k2
+; AVX512F-ONLY-NEXT:    shlq $31, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $32, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $33, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $8, %k1, %k2
+; AVX512F-ONLY-NEXT:    shlq $34, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $35, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r15
+; AVX512F-ONLY-NEXT:    shlq $36, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $37, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $38, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r15d
+; AVX512F-ONLY-NEXT:    kshiftrw $9, %k1, %k2
+; AVX512F-ONLY-NEXT:    shlq $39, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $40, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $41, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $42, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %r13
+; AVX512F-ONLY-NEXT:    shlq $43, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $10, %k1, %k2
+; AVX512F-ONLY-NEXT:    shlq $44, %r15
+; AVX512F-ONLY-NEXT:    orq %r13, %r15
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $45, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r15
+; AVX512F-ONLY-NEXT:    shlq $46, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $47, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $48, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r15d
+; AVX512F-ONLY-NEXT:    kshiftrw $11, %k1, %k1
+; AVX512F-ONLY-NEXT:    shlq $49, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $50, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $51, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $52, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $53, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k1, %r13d
+; AVX512F-ONLY-NEXT:    kshiftrw $4, %k0, %k1
+; AVX512F-ONLY-NEXT:    shlq $54, %r15
+; AVX512F-ONLY-NEXT:    orq %r12, %r15
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $55, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r15
+; AVX512F-ONLY-NEXT:    shlq $56, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $57, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r15
+; AVX512F-ONLY-NEXT:    shlq $58, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    shlq $59, %r13
+; AVX512F-ONLY-NEXT:    orq %r15, %r13
+; AVX512F-ONLY-NEXT:    movzbl %bpl, %r15d
+; AVX512F-ONLY-NEXT:    movl %ebp, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $60, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $61, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k1, %ebp
+; AVX512F-ONLY-NEXT:    kshiftrw $5, %k0, %k1
+; AVX512F-ONLY-NEXT:    shlq $62, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    movq %r15, %r13
+; AVX512F-ONLY-NEXT:    shlq $63, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k1, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $6, %k0, %k1
+; AVX512F-ONLY-NEXT:    orq %r14, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k1, %r14d
+; AVX512F-ONLY-NEXT:    movq %r13, (%rsi)
+; AVX512F-ONLY-NEXT:    movzbl %bl, %ebx
+; AVX512F-ONLY-NEXT:    andl $1, %ebx
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    leaq (%r15,%rbx,2), %rax
+; AVX512F-ONLY-NEXT:    leaq (%rax,%rbx,4), %rax
+; AVX512F-ONLY-NEXT:    leaq (%rax,%rbx,8), %rax
+; AVX512F-ONLY-NEXT:    movq %rbx, %r15
+; AVX512F-ONLY-NEXT:    shlq $4, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    shlq $5, %rbx
+; AVX512F-ONLY-NEXT:    orq %r15, %rbx
+; AVX512F-ONLY-NEXT:    movzbl %r11b, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %r11
+; AVX512F-ONLY-NEXT:    shlq $6, %r11
+; AVX512F-ONLY-NEXT:    movq %rax, %r15
+; AVX512F-ONLY-NEXT:    shlq $7, %r15
+; AVX512F-ONLY-NEXT:    orq %r11, %r15
+; AVX512F-ONLY-NEXT:    movq %rax, %r11
+; AVX512F-ONLY-NEXT:    shlq $8, %r11
+; AVX512F-ONLY-NEXT:    orq %r15, %r11
+; AVX512F-ONLY-NEXT:    movq %rax, %r13
+; AVX512F-ONLY-NEXT:    shlq $9, %r13
+; AVX512F-ONLY-NEXT:    orq %r11, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k0, %r15d
+; AVX512F-ONLY-NEXT:    kshiftrw $8, %k0, %k0
+; AVX512F-ONLY-NEXT:    shlq $10, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movzbl %r10b, %r11d
+; AVX512F-ONLY-NEXT:    andl $1, %r11d
+; AVX512F-ONLY-NEXT:    movq %r11, %r10
+; AVX512F-ONLY-NEXT:    shlq $11, %r10
+; AVX512F-ONLY-NEXT:    orq %rax, %r10
+; AVX512F-ONLY-NEXT:    movq %r11, %rax
+; AVX512F-ONLY-NEXT:    shlq $12, %rax
+; AVX512F-ONLY-NEXT:    orq %r10, %rax
+; AVX512F-ONLY-NEXT:    movq %r11, %r10
+; AVX512F-ONLY-NEXT:    shlq $13, %r10
+; AVX512F-ONLY-NEXT:    orq %rax, %r10
+; AVX512F-ONLY-NEXT:    movq %r11, %rax
+; AVX512F-ONLY-NEXT:    shlq $14, %rax
+; AVX512F-ONLY-NEXT:    orq %r10, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k0, %r10d
+; AVX512F-ONLY-NEXT:    shlq $15, %r11
+; AVX512F-ONLY-NEXT:    orq %rax, %r11
+; AVX512F-ONLY-NEXT:    movzbl %r15b, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %r15
+; AVX512F-ONLY-NEXT:    shlq $16, %r15
+; AVX512F-ONLY-NEXT:    orq %r11, %r15
+; AVX512F-ONLY-NEXT:    movq %rax, %r11
+; AVX512F-ONLY-NEXT:    shlq $17, %r11
+; AVX512F-ONLY-NEXT:    orq %r15, %r11
+; AVX512F-ONLY-NEXT:    movq %rax, %r15
+; AVX512F-ONLY-NEXT:    shlq $18, %r15
+; AVX512F-ONLY-NEXT:    orq %r11, %r15
+; AVX512F-ONLY-NEXT:    movq %rax, %r11
+; AVX512F-ONLY-NEXT:    shlq $19, %r11
+; AVX512F-ONLY-NEXT:    orq %r15, %r11
+; AVX512F-ONLY-NEXT:    shlq $20, %rax
+; AVX512F-ONLY-NEXT:    orq %r11, %rax
+; AVX512F-ONLY-NEXT:    movzbl %r8b, %r8d
+; AVX512F-ONLY-NEXT:    andl $1, %r8d
+; AVX512F-ONLY-NEXT:    movq %r8, %r11
+; AVX512F-ONLY-NEXT:    shlq $21, %r11
+; AVX512F-ONLY-NEXT:    orq %rax, %r11
+; AVX512F-ONLY-NEXT:    movq %r8, %rax
+; AVX512F-ONLY-NEXT:    shlq $22, %rax
+; AVX512F-ONLY-NEXT:    orq %r11, %rax
+; AVX512F-ONLY-NEXT:    movq %r8, %r11
+; AVX512F-ONLY-NEXT:    shlq $23, %r11
+; AVX512F-ONLY-NEXT:    orq %rax, %r11
+; AVX512F-ONLY-NEXT:    movq %r8, %rax
+; AVX512F-ONLY-NEXT:    shlq $24, %rax
+; AVX512F-ONLY-NEXT:    orq %r11, %rax
+; AVX512F-ONLY-NEXT:    shlq $25, %r8
+; AVX512F-ONLY-NEXT:    orq %rax, %r8
+; AVX512F-ONLY-NEXT:    movzbl %dil, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %rdi
+; AVX512F-ONLY-NEXT:    shlq $26, %rdi
+; AVX512F-ONLY-NEXT:    orq %r8, %rdi
+; AVX512F-ONLY-NEXT:    movq %rax, %r8
+; AVX512F-ONLY-NEXT:    shlq $27, %r8
+; AVX512F-ONLY-NEXT:    orq %rdi, %r8
+; AVX512F-ONLY-NEXT:    movq %rax, %rdi
+; AVX512F-ONLY-NEXT:    shlq $28, %rdi
+; AVX512F-ONLY-NEXT:    orq %r8, %rdi
+; AVX512F-ONLY-NEXT:    movq %rax, %r8
+; AVX512F-ONLY-NEXT:    shlq $29, %r8
+; AVX512F-ONLY-NEXT:    orq %rdi, %r8
+; AVX512F-ONLY-NEXT:    shlq $30, %rax
+; AVX512F-ONLY-NEXT:    orq %r8, %rax
+; AVX512F-ONLY-NEXT:    movzbl %dl, %edx
+; AVX512F-ONLY-NEXT:    andl $1, %edx
+; AVX512F-ONLY-NEXT:    movq %rdx, %rdi
+; AVX512F-ONLY-NEXT:    shlq $31, %rdi
+; AVX512F-ONLY-NEXT:    orq %rax, %rdi
+; AVX512F-ONLY-NEXT:    movq %rdx, %rax
+; AVX512F-ONLY-NEXT:    shlq $32, %rax
+; AVX512F-ONLY-NEXT:    orq %rdi, %rax
+; AVX512F-ONLY-NEXT:    movq %rdx, %rdi
+; AVX512F-ONLY-NEXT:    shlq $33, %rdi
+; AVX512F-ONLY-NEXT:    orq %rax, %rdi
+; AVX512F-ONLY-NEXT:    movq %rdx, %rax
+; AVX512F-ONLY-NEXT:    shlq $34, %rax
+; AVX512F-ONLY-NEXT:    orq %rdi, %rax
+; AVX512F-ONLY-NEXT:    shlq $35, %rdx
+; AVX512F-ONLY-NEXT:    orq %rax, %rdx
+; AVX512F-ONLY-NEXT:    andl $1, %ebp
+; AVX512F-ONLY-NEXT:    movq %rbp, %rax
+; AVX512F-ONLY-NEXT:    shlq $36, %rax
+; AVX512F-ONLY-NEXT:    orq %rdx, %rax
+; AVX512F-ONLY-NEXT:    movq %rbp, %rdx
+; AVX512F-ONLY-NEXT:    shlq $37, %rdx
+; AVX512F-ONLY-NEXT:    orq %rax, %rdx
+; AVX512F-ONLY-NEXT:    movq %rbp, %rax
+; AVX512F-ONLY-NEXT:    shlq $38, %rax
+; AVX512F-ONLY-NEXT:    orq %rdx, %rax
+; AVX512F-ONLY-NEXT:    movq %rbp, %rdx
+; AVX512F-ONLY-NEXT:    shlq $39, %rdx
+; AVX512F-ONLY-NEXT:    orq %rax, %rdx
+; AVX512F-ONLY-NEXT:    shlq $40, %rbp
+; AVX512F-ONLY-NEXT:    orq %rdx, %rbp
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $41, %rax
+; AVX512F-ONLY-NEXT:    orq %rbp, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %rdx
+; AVX512F-ONLY-NEXT:    shlq $42, %rdx
+; AVX512F-ONLY-NEXT:    orq %rax, %rdx
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $43, %rax
+; AVX512F-ONLY-NEXT:    orq %rdx, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %rdx
+; AVX512F-ONLY-NEXT:    shlq $44, %rdx
+; AVX512F-ONLY-NEXT:    orq %rax, %rdx
+; AVX512F-ONLY-NEXT:    shlq $45, %r12
+; AVX512F-ONLY-NEXT:    orq %rdx, %r12
+; AVX512F-ONLY-NEXT:    andl $1, %r14d
+; AVX512F-ONLY-NEXT:    movq %r14, %rax
+; AVX512F-ONLY-NEXT:    shlq $46, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r14, %rdx
+; AVX512F-ONLY-NEXT:    shlq $47, %rdx
+; AVX512F-ONLY-NEXT:    orq %rax, %rdx
+; AVX512F-ONLY-NEXT:    movq %r14, %rax
+; AVX512F-ONLY-NEXT:    shlq $48, %rax
+; AVX512F-ONLY-NEXT:    orq %rdx, %rax
+; AVX512F-ONLY-NEXT:    movq %r14, %rdx
+; AVX512F-ONLY-NEXT:    shlq $49, %rdx
+; AVX512F-ONLY-NEXT:    orq %rax, %rdx
+; AVX512F-ONLY-NEXT:    shlq $50, %r14
+; AVX512F-ONLY-NEXT:    orq %rdx, %r14
+; AVX512F-ONLY-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %rdx
+; AVX512F-ONLY-NEXT:    shlq $51, %rdx
+; AVX512F-ONLY-NEXT:    orq %r14, %rdx
+; AVX512F-ONLY-NEXT:    movq %rax, %rdi
+; AVX512F-ONLY-NEXT:    shlq $52, %rdi
+; AVX512F-ONLY-NEXT:    orq %rdx, %rdi
+; AVX512F-ONLY-NEXT:    movq %rax, %rdx
+; AVX512F-ONLY-NEXT:    shlq $53, %rdx
+; AVX512F-ONLY-NEXT:    orq %rdi, %rdx
+; AVX512F-ONLY-NEXT:    movq %rax, %rdi
+; AVX512F-ONLY-NEXT:    shlq $54, %rdi
+; AVX512F-ONLY-NEXT:    orq %rdx, %rdi
+; AVX512F-ONLY-NEXT:    shlq $55, %rax
+; AVX512F-ONLY-NEXT:    orq %rdi, %rax
+; AVX512F-ONLY-NEXT:    andl $1, %r10d
+; AVX512F-ONLY-NEXT:    movq %r10, %rdx
+; AVX512F-ONLY-NEXT:    shlq $56, %rdx
+; AVX512F-ONLY-NEXT:    orq %rax, %rdx
+; AVX512F-ONLY-NEXT:    movq %r10, %rax
+; AVX512F-ONLY-NEXT:    shlq $57, %rax
+; AVX512F-ONLY-NEXT:    orq %rdx, %rax
+; AVX512F-ONLY-NEXT:    movq %r10, %rdx
+; AVX512F-ONLY-NEXT:    shlq $58, %rdx
+; AVX512F-ONLY-NEXT:    orq %rax, %rdx
+; AVX512F-ONLY-NEXT:    movq %r10, %rax
+; AVX512F-ONLY-NEXT:    shlq $59, %rax
+; AVX512F-ONLY-NEXT:    orq %rdx, %rax
+; AVX512F-ONLY-NEXT:    shlq $60, %r10
+; AVX512F-ONLY-NEXT:    orq %rax, %r10
+; AVX512F-ONLY-NEXT:    movq %r9, %rax
+; AVX512F-ONLY-NEXT:    shlq $61, %rax
+; AVX512F-ONLY-NEXT:    orq %r10, %rax
+; AVX512F-ONLY-NEXT:    shlq $62, %r9
+; AVX512F-ONLY-NEXT:    orq %rax, %r9
+; AVX512F-ONLY-NEXT:    movzbl %cl, %eax
+; AVX512F-ONLY-NEXT:    shlq $63, %rax
+; AVX512F-ONLY-NEXT:    orq %r9, %rax
+; AVX512F-ONLY-NEXT:    orq %rbx, %rax
+; AVX512F-ONLY-NEXT:    movq %rax, 8(%rsi)
+; AVX512F-ONLY-NEXT:    popq %rbx
+; AVX512F-ONLY-NEXT:    popq %r12
+; AVX512F-ONLY-NEXT:    popq %r13
+; AVX512F-ONLY-NEXT:    popq %r14
+; AVX512F-ONLY-NEXT:    popq %r15
+; AVX512F-ONLY-NEXT:    popq %rbp
+; AVX512F-ONLY-NEXT:    retq
+;
+; AVX512DQ-LABEL: mask_replication_factor5_vf32:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    pushq %rbp
+; AVX512DQ-NEXT:    pushq %r15
+; AVX512DQ-NEXT:    pushq %r14
+; AVX512DQ-NEXT:    pushq %r13
+; AVX512DQ-NEXT:    pushq %r12
+; AVX512DQ-NEXT:    pushq %rbx
+; AVX512DQ-NEXT:    kmovw (%rdi), %k1
+; AVX512DQ-NEXT:    kmovw 2(%rdi), %k0
+; AVX512DQ-NEXT:    kshiftrw $9, %k0, %k2
+; AVX512DQ-NEXT:    kmovw %k2, %ecx
+; AVX512DQ-NEXT:    kshiftrw $7, %k0, %k2
+; AVX512DQ-NEXT:    kmovw %k2, %eax
+; AVX512DQ-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512DQ-NEXT:    kshiftrw $3, %k0, %k2
+; AVX512DQ-NEXT:    kmovw %k2, %edx
+; AVX512DQ-NEXT:    kshiftrw $2, %k0, %k2
+; AVX512DQ-NEXT:    kmovw %k2, %edi
+; AVX512DQ-NEXT:    kshiftrw $1, %k0, %k2
+; AVX512DQ-NEXT:    kmovw %k2, %r8d
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k2
+; AVX512DQ-NEXT:    kmovw %k2, %r10d
+; AVX512DQ-NEXT:    kshiftrw $14, %k1, %k2
+; AVX512DQ-NEXT:    kmovw %k2, %r11d
+; AVX512DQ-NEXT:    kshiftrw $13, %k1, %k2
+; AVX512DQ-NEXT:    kmovw %k2, %ebx
+; AVX512DQ-NEXT:    kshiftrw $10, %k0, %k2
+; AVX512DQ-NEXT:    kmovw %k2, %r14d
+; AVX512DQ-NEXT:    kshiftrw $11, %k0, %k2
+; AVX512DQ-NEXT:    kmovw %k2, %r15d
+; AVX512DQ-NEXT:    kshiftrw $12, %k0, %k2
+; AVX512DQ-NEXT:    kmovw %k2, %r12d
+; AVX512DQ-NEXT:    kshiftrw $13, %k0, %k2
+; AVX512DQ-NEXT:    andl $1, %r14d
+; AVX512DQ-NEXT:    movl %ecx, %r9d
+; AVX512DQ-NEXT:    andl $1, %r9d
+; AVX512DQ-NEXT:    leal (%r9,%r9,2), %r13d
+; AVX512DQ-NEXT:    leal (%r13,%r14,4), %r13d
+; AVX512DQ-NEXT:    leal (%r13,%r14,8), %ebp
+; AVX512DQ-NEXT:    movl %r14d, %r13d
+; AVX512DQ-NEXT:    shll $4, %r13d
+; AVX512DQ-NEXT:    orl %ebp, %r13d
+; AVX512DQ-NEXT:    movl %r14d, %ebp
+; AVX512DQ-NEXT:    shll $5, %ebp
+; AVX512DQ-NEXT:    orl %r13d, %ebp
+; AVX512DQ-NEXT:    shll $6, %r14d
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    movl %r15d, %r13d
+; AVX512DQ-NEXT:    shll $7, %r13d
+; AVX512DQ-NEXT:    orl %r14d, %r13d
+; AVX512DQ-NEXT:    movl %r15d, %r14d
+; AVX512DQ-NEXT:    shll $8, %r14d
+; AVX512DQ-NEXT:    orl %r13d, %r14d
+; AVX512DQ-NEXT:    movl %r15d, %r13d
+; AVX512DQ-NEXT:    shll $9, %r13d
+; AVX512DQ-NEXT:    orl %r14d, %r13d
+; AVX512DQ-NEXT:    movl %r15d, %eax
+; AVX512DQ-NEXT:    shll $10, %eax
+; AVX512DQ-NEXT:    orl %r13d, %eax
+; AVX512DQ-NEXT:    kmovw %k2, %r14d
+; AVX512DQ-NEXT:    kshiftrw $14, %k0, %k2
+; AVX512DQ-NEXT:    shll $11, %r15d
+; AVX512DQ-NEXT:    orl %eax, %r15d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movl %r12d, %eax
+; AVX512DQ-NEXT:    shll $12, %eax
+; AVX512DQ-NEXT:    orl %r15d, %eax
+; AVX512DQ-NEXT:    movl %r12d, %r15d
+; AVX512DQ-NEXT:    shll $13, %r15d
+; AVX512DQ-NEXT:    orl %eax, %r15d
+; AVX512DQ-NEXT:    movl %r12d, %eax
+; AVX512DQ-NEXT:    shll $14, %eax
+; AVX512DQ-NEXT:    orl %r15d, %eax
+; AVX512DQ-NEXT:    movl %r12d, %r13d
+; AVX512DQ-NEXT:    shll $15, %r13d
+; AVX512DQ-NEXT:    orl %eax, %r13d
+; AVX512DQ-NEXT:    kmovw %k2, %r15d
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k2
+; AVX512DQ-NEXT:    shll $16, %r12d
+; AVX512DQ-NEXT:    orl %r13d, %r12d
+; AVX512DQ-NEXT:    andl $1, %r14d
+; AVX512DQ-NEXT:    movl %r14d, %eax
+; AVX512DQ-NEXT:    shll $17, %eax
+; AVX512DQ-NEXT:    orl %r12d, %eax
+; AVX512DQ-NEXT:    movl %r14d, %r12d
+; AVX512DQ-NEXT:    shll $18, %r12d
+; AVX512DQ-NEXT:    orl %eax, %r12d
+; AVX512DQ-NEXT:    movl %r14d, %eax
+; AVX512DQ-NEXT:    shll $19, %eax
+; AVX512DQ-NEXT:    orl %r12d, %eax
+; AVX512DQ-NEXT:    movl %r14d, %r12d
+; AVX512DQ-NEXT:    shll $20, %r12d
+; AVX512DQ-NEXT:    orl %eax, %r12d
+; AVX512DQ-NEXT:    kmovw %k2, %eax
+; AVX512DQ-NEXT:    kshiftrw $6, %k1, %k2
+; AVX512DQ-NEXT:    shll $21, %r14d
+; AVX512DQ-NEXT:    orl %r12d, %r14d
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    movl %r15d, %r12d
+; AVX512DQ-NEXT:    shll $22, %r12d
+; AVX512DQ-NEXT:    orl %r14d, %r12d
+; AVX512DQ-NEXT:    movl %r15d, %r14d
+; AVX512DQ-NEXT:    shll $23, %r14d
+; AVX512DQ-NEXT:    orl %r12d, %r14d
+; AVX512DQ-NEXT:    movl %r15d, %r12d
+; AVX512DQ-NEXT:    shll $24, %r12d
+; AVX512DQ-NEXT:    orl %r14d, %r12d
+; AVX512DQ-NEXT:    movl %r15d, %r14d
+; AVX512DQ-NEXT:    shll $25, %r14d
+; AVX512DQ-NEXT:    orl %r12d, %r14d
+; AVX512DQ-NEXT:    shll $26, %r15d
+; AVX512DQ-NEXT:    orl %r14d, %r15d
+; AVX512DQ-NEXT:    movl %eax, %r14d
+; AVX512DQ-NEXT:    andl $1, %r14d
+; AVX512DQ-NEXT:    movl %r14d, %r12d
+; AVX512DQ-NEXT:    shll $27, %r12d
+; AVX512DQ-NEXT:    orl %r15d, %r12d
+; AVX512DQ-NEXT:    movl %r14d, %r15d
+; AVX512DQ-NEXT:    shll $28, %r15d
+; AVX512DQ-NEXT:    orl %r12d, %r15d
+; AVX512DQ-NEXT:    movl %r14d, %r12d
+; AVX512DQ-NEXT:    shll $29, %r12d
+; AVX512DQ-NEXT:    orl %r15d, %r12d
+; AVX512DQ-NEXT:    kmovw %k2, %r15d
+; AVX512DQ-NEXT:    kshiftrw $1, %k1, %k2
+; AVX512DQ-NEXT:    shll $30, %r14d
+; AVX512DQ-NEXT:    orl %r12d, %r14d
+; AVX512DQ-NEXT:    kmovw %k2, %r12d
+; AVX512DQ-NEXT:    kshiftrw $2, %k1, %k2
+; AVX512DQ-NEXT:    shll $31, %eax
+; AVX512DQ-NEXT:    orl %r14d, %eax
+; AVX512DQ-NEXT:    orl %ebp, %eax
+; AVX512DQ-NEXT:    movl %eax, 16(%rsi)
+; AVX512DQ-NEXT:    kmovw %k1, %eax
+; AVX512DQ-NEXT:    movzbl %al, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    leaq (%rax,%rax,2), %r14
+; AVX512DQ-NEXT:    leaq (%r14,%rax,4), %r14
+; AVX512DQ-NEXT:    leaq (%r14,%rax,8), %r14
+; AVX512DQ-NEXT:    shlq $4, %rax
+; AVX512DQ-NEXT:    orq %r14, %rax
+; AVX512DQ-NEXT:    movzbl %r12b, %r13d
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %r14
+; AVX512DQ-NEXT:    shlq $5, %r14
+; AVX512DQ-NEXT:    orq %rax, %r14
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $6, %rax
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $7, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $8, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    kmovw %k2, %ebp
+; AVX512DQ-NEXT:    kshiftrw $3, %k1, %k2
+; AVX512DQ-NEXT:    shlq $9, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movzbl %bpl, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $10, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $11, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $12, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $13, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    kmovw %k2, %eax
+; AVX512DQ-NEXT:    kshiftrw $4, %k1, %k2
+; AVX512DQ-NEXT:    shlq $14, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    movzbl %al, %r13d
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $15, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $16, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $17, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $18, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    kmovw %k2, %eax
+; AVX512DQ-NEXT:    kshiftrw $5, %k1, %k2
+; AVX512DQ-NEXT:    shlq $19, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    movzbl %al, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $20, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $21, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $22, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $23, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    kmovw %k2, %eax
+; AVX512DQ-NEXT:    kshiftrw $12, %k1, %k2
+; AVX512DQ-NEXT:    shlq $24, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    movzbl %al, %r13d
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $25, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $26, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $27, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $28, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    shlq $29, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    movzbl %r15b, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %r12
+; AVX512DQ-NEXT:    shlq $30, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    kmovw %k2, %ebp
+; AVX512DQ-NEXT:    kshiftrw $7, %k1, %k2
+; AVX512DQ-NEXT:    shlq $31, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $32, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $33, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    kmovw %k2, %r12d
+; AVX512DQ-NEXT:    kshiftrw $8, %k1, %k2
+; AVX512DQ-NEXT:    shlq $34, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $35, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movq %r12, %r15
+; AVX512DQ-NEXT:    shlq $36, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $37, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $38, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    kmovw %k2, %r15d
+; AVX512DQ-NEXT:    kshiftrw $9, %k1, %k2
+; AVX512DQ-NEXT:    shlq $39, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $40, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $41, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $42, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r15, %r13
+; AVX512DQ-NEXT:    shlq $43, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    kmovw %k2, %r12d
+; AVX512DQ-NEXT:    kshiftrw $10, %k1, %k2
+; AVX512DQ-NEXT:    shlq $44, %r15
+; AVX512DQ-NEXT:    orq %r13, %r15
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $45, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movq %r12, %r15
+; AVX512DQ-NEXT:    shlq $46, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $47, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $48, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    kmovw %k2, %r15d
+; AVX512DQ-NEXT:    kshiftrw $11, %k1, %k1
+; AVX512DQ-NEXT:    shlq $49, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $50, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $51, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $52, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $53, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    kmovw %k1, %r13d
+; AVX512DQ-NEXT:    kshiftrw $4, %k0, %k1
+; AVX512DQ-NEXT:    shlq $54, %r15
+; AVX512DQ-NEXT:    orq %r12, %r15
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $55, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movq %r13, %r15
+; AVX512DQ-NEXT:    shlq $56, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $57, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movq %r13, %r15
+; AVX512DQ-NEXT:    shlq $58, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    shlq $59, %r13
+; AVX512DQ-NEXT:    orq %r15, %r13
+; AVX512DQ-NEXT:    movzbl %bpl, %r15d
+; AVX512DQ-NEXT:    movl %ebp, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $60, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $61, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    kmovw %k1, %ebp
+; AVX512DQ-NEXT:    kshiftrw $5, %k0, %k1
+; AVX512DQ-NEXT:    shlq $62, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    movq %r15, %r13
+; AVX512DQ-NEXT:    shlq $63, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    kmovw %k1, %r12d
+; AVX512DQ-NEXT:    kshiftrw $6, %k0, %k1
+; AVX512DQ-NEXT:    orq %r14, %r13
+; AVX512DQ-NEXT:    kmovw %k1, %r14d
+; AVX512DQ-NEXT:    movq %r13, (%rsi)
+; AVX512DQ-NEXT:    movzbl %bl, %ebx
+; AVX512DQ-NEXT:    andl $1, %ebx
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    leaq (%r15,%rbx,2), %rax
+; AVX512DQ-NEXT:    leaq (%rax,%rbx,4), %rax
+; AVX512DQ-NEXT:    leaq (%rax,%rbx,8), %rax
+; AVX512DQ-NEXT:    movq %rbx, %r15
+; AVX512DQ-NEXT:    shlq $4, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    shlq $5, %rbx
+; AVX512DQ-NEXT:    orq %r15, %rbx
+; AVX512DQ-NEXT:    movzbl %r11b, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %r11
+; AVX512DQ-NEXT:    shlq $6, %r11
+; AVX512DQ-NEXT:    movq %rax, %r15
+; AVX512DQ-NEXT:    shlq $7, %r15
+; AVX512DQ-NEXT:    orq %r11, %r15
+; AVX512DQ-NEXT:    movq %rax, %r11
+; AVX512DQ-NEXT:    shlq $8, %r11
+; AVX512DQ-NEXT:    orq %r15, %r11
+; AVX512DQ-NEXT:    movq %rax, %r13
+; AVX512DQ-NEXT:    shlq $9, %r13
+; AVX512DQ-NEXT:    orq %r11, %r13
+; AVX512DQ-NEXT:    kmovw %k0, %r15d
+; AVX512DQ-NEXT:    kshiftrw $8, %k0, %k0
+; AVX512DQ-NEXT:    shlq $10, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movzbl %r10b, %r11d
+; AVX512DQ-NEXT:    andl $1, %r11d
+; AVX512DQ-NEXT:    movq %r11, %r10
+; AVX512DQ-NEXT:    shlq $11, %r10
+; AVX512DQ-NEXT:    orq %rax, %r10
+; AVX512DQ-NEXT:    movq %r11, %rax
+; AVX512DQ-NEXT:    shlq $12, %rax
+; AVX512DQ-NEXT:    orq %r10, %rax
+; AVX512DQ-NEXT:    movq %r11, %r10
+; AVX512DQ-NEXT:    shlq $13, %r10
+; AVX512DQ-NEXT:    orq %rax, %r10
+; AVX512DQ-NEXT:    movq %r11, %rax
+; AVX512DQ-NEXT:    shlq $14, %rax
+; AVX512DQ-NEXT:    orq %r10, %rax
+; AVX512DQ-NEXT:    kmovw %k0, %r10d
+; AVX512DQ-NEXT:    shlq $15, %r11
+; AVX512DQ-NEXT:    orq %rax, %r11
+; AVX512DQ-NEXT:    movzbl %r15b, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %r15
+; AVX512DQ-NEXT:    shlq $16, %r15
+; AVX512DQ-NEXT:    orq %r11, %r15
+; AVX512DQ-NEXT:    movq %rax, %r11
+; AVX512DQ-NEXT:    shlq $17, %r11
+; AVX512DQ-NEXT:    orq %r15, %r11
+; AVX512DQ-NEXT:    movq %rax, %r15
+; AVX512DQ-NEXT:    shlq $18, %r15
+; AVX512DQ-NEXT:    orq %r11, %r15
+; AVX512DQ-NEXT:    movq %rax, %r11
+; AVX512DQ-NEXT:    shlq $19, %r11
+; AVX512DQ-NEXT:    orq %r15, %r11
+; AVX512DQ-NEXT:    shlq $20, %rax
+; AVX512DQ-NEXT:    orq %r11, %rax
+; AVX512DQ-NEXT:    movzbl %r8b, %r8d
+; AVX512DQ-NEXT:    andl $1, %r8d
+; AVX512DQ-NEXT:    movq %r8, %r11
+; AVX512DQ-NEXT:    shlq $21, %r11
+; AVX512DQ-NEXT:    orq %rax, %r11
+; AVX512DQ-NEXT:    movq %r8, %rax
+; AVX512DQ-NEXT:    shlq $22, %rax
+; AVX512DQ-NEXT:    orq %r11, %rax
+; AVX512DQ-NEXT:    movq %r8, %r11
+; AVX512DQ-NEXT:    shlq $23, %r11
+; AVX512DQ-NEXT:    orq %rax, %r11
+; AVX512DQ-NEXT:    movq %r8, %rax
+; AVX512DQ-NEXT:    shlq $24, %rax
+; AVX512DQ-NEXT:    orq %r11, %rax
+; AVX512DQ-NEXT:    shlq $25, %r8
+; AVX512DQ-NEXT:    orq %rax, %r8
+; AVX512DQ-NEXT:    movzbl %dil, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %rdi
+; AVX512DQ-NEXT:    shlq $26, %rdi
+; AVX512DQ-NEXT:    orq %r8, %rdi
+; AVX512DQ-NEXT:    movq %rax, %r8
+; AVX512DQ-NEXT:    shlq $27, %r8
+; AVX512DQ-NEXT:    orq %rdi, %r8
+; AVX512DQ-NEXT:    movq %rax, %rdi
+; AVX512DQ-NEXT:    shlq $28, %rdi
+; AVX512DQ-NEXT:    orq %r8, %rdi
+; AVX512DQ-NEXT:    movq %rax, %r8
+; AVX512DQ-NEXT:    shlq $29, %r8
+; AVX512DQ-NEXT:    orq %rdi, %r8
+; AVX512DQ-NEXT:    shlq $30, %rax
+; AVX512DQ-NEXT:    orq %r8, %rax
+; AVX512DQ-NEXT:    movzbl %dl, %edx
+; AVX512DQ-NEXT:    andl $1, %edx
+; AVX512DQ-NEXT:    movq %rdx, %rdi
+; AVX512DQ-NEXT:    shlq $31, %rdi
+; AVX512DQ-NEXT:    orq %rax, %rdi
+; AVX512DQ-NEXT:    movq %rdx, %rax
+; AVX512DQ-NEXT:    shlq $32, %rax
+; AVX512DQ-NEXT:    orq %rdi, %rax
+; AVX512DQ-NEXT:    movq %rdx, %rdi
+; AVX512DQ-NEXT:    shlq $33, %rdi
+; AVX512DQ-NEXT:    orq %rax, %rdi
+; AVX512DQ-NEXT:    movq %rdx, %rax
+; AVX512DQ-NEXT:    shlq $34, %rax
+; AVX512DQ-NEXT:    orq %rdi, %rax
+; AVX512DQ-NEXT:    shlq $35, %rdx
+; AVX512DQ-NEXT:    orq %rax, %rdx
+; AVX512DQ-NEXT:    andl $1, %ebp
+; AVX512DQ-NEXT:    movq %rbp, %rax
+; AVX512DQ-NEXT:    shlq $36, %rax
+; AVX512DQ-NEXT:    orq %rdx, %rax
+; AVX512DQ-NEXT:    movq %rbp, %rdx
+; AVX512DQ-NEXT:    shlq $37, %rdx
+; AVX512DQ-NEXT:    orq %rax, %rdx
+; AVX512DQ-NEXT:    movq %rbp, %rax
+; AVX512DQ-NEXT:    shlq $38, %rax
+; AVX512DQ-NEXT:    orq %rdx, %rax
+; AVX512DQ-NEXT:    movq %rbp, %rdx
+; AVX512DQ-NEXT:    shlq $39, %rdx
+; AVX512DQ-NEXT:    orq %rax, %rdx
+; AVX512DQ-NEXT:    shlq $40, %rbp
+; AVX512DQ-NEXT:    orq %rdx, %rbp
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $41, %rax
+; AVX512DQ-NEXT:    orq %rbp, %rax
+; AVX512DQ-NEXT:    movq %r12, %rdx
+; AVX512DQ-NEXT:    shlq $42, %rdx
+; AVX512DQ-NEXT:    orq %rax, %rdx
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $43, %rax
+; AVX512DQ-NEXT:    orq %rdx, %rax
+; AVX512DQ-NEXT:    movq %r12, %rdx
+; AVX512DQ-NEXT:    shlq $44, %rdx
+; AVX512DQ-NEXT:    orq %rax, %rdx
+; AVX512DQ-NEXT:    shlq $45, %r12
+; AVX512DQ-NEXT:    orq %rdx, %r12
+; AVX512DQ-NEXT:    andl $1, %r14d
+; AVX512DQ-NEXT:    movq %r14, %rax
+; AVX512DQ-NEXT:    shlq $46, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r14, %rdx
+; AVX512DQ-NEXT:    shlq $47, %rdx
+; AVX512DQ-NEXT:    orq %rax, %rdx
+; AVX512DQ-NEXT:    movq %r14, %rax
+; AVX512DQ-NEXT:    shlq $48, %rax
+; AVX512DQ-NEXT:    orq %rdx, %rax
+; AVX512DQ-NEXT:    movq %r14, %rdx
+; AVX512DQ-NEXT:    shlq $49, %rdx
+; AVX512DQ-NEXT:    orq %rax, %rdx
+; AVX512DQ-NEXT:    shlq $50, %r14
+; AVX512DQ-NEXT:    orq %rdx, %r14
+; AVX512DQ-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %rdx
+; AVX512DQ-NEXT:    shlq $51, %rdx
+; AVX512DQ-NEXT:    orq %r14, %rdx
+; AVX512DQ-NEXT:    movq %rax, %rdi
+; AVX512DQ-NEXT:    shlq $52, %rdi
+; AVX512DQ-NEXT:    orq %rdx, %rdi
+; AVX512DQ-NEXT:    movq %rax, %rdx
+; AVX512DQ-NEXT:    shlq $53, %rdx
+; AVX512DQ-NEXT:    orq %rdi, %rdx
+; AVX512DQ-NEXT:    movq %rax, %rdi
+; AVX512DQ-NEXT:    shlq $54, %rdi
+; AVX512DQ-NEXT:    orq %rdx, %rdi
+; AVX512DQ-NEXT:    shlq $55, %rax
+; AVX512DQ-NEXT:    orq %rdi, %rax
+; AVX512DQ-NEXT:    andl $1, %r10d
+; AVX512DQ-NEXT:    movq %r10, %rdx
+; AVX512DQ-NEXT:    shlq $56, %rdx
+; AVX512DQ-NEXT:    orq %rax, %rdx
+; AVX512DQ-NEXT:    movq %r10, %rax
+; AVX512DQ-NEXT:    shlq $57, %rax
+; AVX512DQ-NEXT:    orq %rdx, %rax
+; AVX512DQ-NEXT:    movq %r10, %rdx
+; AVX512DQ-NEXT:    shlq $58, %rdx
+; AVX512DQ-NEXT:    orq %rax, %rdx
+; AVX512DQ-NEXT:    movq %r10, %rax
+; AVX512DQ-NEXT:    shlq $59, %rax
+; AVX512DQ-NEXT:    orq %rdx, %rax
+; AVX512DQ-NEXT:    shlq $60, %r10
+; AVX512DQ-NEXT:    orq %rax, %r10
+; AVX512DQ-NEXT:    movq %r9, %rax
+; AVX512DQ-NEXT:    shlq $61, %rax
+; AVX512DQ-NEXT:    orq %r10, %rax
+; AVX512DQ-NEXT:    shlq $62, %r9
+; AVX512DQ-NEXT:    orq %rax, %r9
+; AVX512DQ-NEXT:    movzbl %cl, %eax
+; AVX512DQ-NEXT:    shlq $63, %rax
+; AVX512DQ-NEXT:    orq %r9, %rax
+; AVX512DQ-NEXT:    orq %rbx, %rax
+; AVX512DQ-NEXT:    movq %rax, 8(%rsi)
+; AVX512DQ-NEXT:    popq %rbx
+; AVX512DQ-NEXT:    popq %r12
+; AVX512DQ-NEXT:    popq %r13
+; AVX512DQ-NEXT:    popq %r14
+; AVX512DQ-NEXT:    popq %r15
+; AVX512DQ-NEXT:    popq %rbp
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: mask_replication_factor5_vf32:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    pushq %rbp
+; AVX512BW-NEXT:    pushq %r15
+; AVX512BW-NEXT:    pushq %r14
+; AVX512BW-NEXT:    pushq %r13
+; AVX512BW-NEXT:    pushq %r12
+; AVX512BW-NEXT:    pushq %rbx
+; AVX512BW-NEXT:    kmovd (%rdi), %k0
+; AVX512BW-NEXT:    kshiftrd $25, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %ecx
+; AVX512BW-NEXT:    kshiftrd $19, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512BW-NEXT:    kshiftrd $18, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %edx
+; AVX512BW-NEXT:    kshiftrd $17, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %edi
+; AVX512BW-NEXT:    kshiftrd $16, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r8d
+; AVX512BW-NEXT:    kshiftrd $15, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r10d
+; AVX512BW-NEXT:    kshiftrd $14, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r11d
+; AVX512BW-NEXT:    kshiftrd $13, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %ebx
+; AVX512BW-NEXT:    kshiftrd $26, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r14d
+; AVX512BW-NEXT:    kshiftrd $27, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r15d
+; AVX512BW-NEXT:    kshiftrd $28, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrd $29, %k0, %k1
+; AVX512BW-NEXT:    andl $1, %r14d
+; AVX512BW-NEXT:    movl %ecx, %r9d
+; AVX512BW-NEXT:    andl $1, %r9d
+; AVX512BW-NEXT:    leal (%r9,%r9,2), %r13d
+; AVX512BW-NEXT:    leal (%r13,%r14,4), %r13d
+; AVX512BW-NEXT:    leal (%r13,%r14,8), %ebp
+; AVX512BW-NEXT:    movl %r14d, %r13d
+; AVX512BW-NEXT:    shll $4, %r13d
+; AVX512BW-NEXT:    orl %ebp, %r13d
+; AVX512BW-NEXT:    movl %r14d, %ebp
+; AVX512BW-NEXT:    shll $5, %ebp
+; AVX512BW-NEXT:    orl %r13d, %ebp
+; AVX512BW-NEXT:    shll $6, %r14d
+; AVX512BW-NEXT:    andl $1, %r15d
+; AVX512BW-NEXT:    movl %r15d, %r13d
+; AVX512BW-NEXT:    shll $7, %r13d
+; AVX512BW-NEXT:    orl %r14d, %r13d
+; AVX512BW-NEXT:    movl %r15d, %r14d
+; AVX512BW-NEXT:    shll $8, %r14d
+; AVX512BW-NEXT:    orl %r13d, %r14d
+; AVX512BW-NEXT:    movl %r15d, %r13d
+; AVX512BW-NEXT:    shll $9, %r13d
+; AVX512BW-NEXT:    orl %r14d, %r13d
+; AVX512BW-NEXT:    movl %r15d, %eax
+; AVX512BW-NEXT:    shll $10, %eax
+; AVX512BW-NEXT:    orl %r13d, %eax
+; AVX512BW-NEXT:    kmovd %k1, %r14d
+; AVX512BW-NEXT:    kshiftrd $30, %k0, %k1
+; AVX512BW-NEXT:    shll $11, %r15d
+; AVX512BW-NEXT:    orl %eax, %r15d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movl %r12d, %eax
+; AVX512BW-NEXT:    shll $12, %eax
+; AVX512BW-NEXT:    orl %r15d, %eax
+; AVX512BW-NEXT:    movl %r12d, %r15d
+; AVX512BW-NEXT:    shll $13, %r15d
+; AVX512BW-NEXT:    orl %eax, %r15d
+; AVX512BW-NEXT:    movl %r12d, %eax
+; AVX512BW-NEXT:    shll $14, %eax
+; AVX512BW-NEXT:    orl %r15d, %eax
+; AVX512BW-NEXT:    movl %r12d, %r13d
+; AVX512BW-NEXT:    shll $15, %r13d
+; AVX512BW-NEXT:    orl %eax, %r13d
+; AVX512BW-NEXT:    kmovd %k1, %r15d
+; AVX512BW-NEXT:    kshiftrd $31, %k0, %k1
+; AVX512BW-NEXT:    shll $16, %r12d
+; AVX512BW-NEXT:    orl %r13d, %r12d
+; AVX512BW-NEXT:    andl $1, %r14d
+; AVX512BW-NEXT:    movl %r14d, %eax
+; AVX512BW-NEXT:    shll $17, %eax
+; AVX512BW-NEXT:    orl %r12d, %eax
+; AVX512BW-NEXT:    movl %r14d, %r12d
+; AVX512BW-NEXT:    shll $18, %r12d
+; AVX512BW-NEXT:    orl %eax, %r12d
+; AVX512BW-NEXT:    movl %r14d, %eax
+; AVX512BW-NEXT:    shll $19, %eax
+; AVX512BW-NEXT:    orl %r12d, %eax
+; AVX512BW-NEXT:    movl %r14d, %r12d
+; AVX512BW-NEXT:    shll $20, %r12d
+; AVX512BW-NEXT:    orl %eax, %r12d
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    kshiftrd $6, %k0, %k1
+; AVX512BW-NEXT:    shll $21, %r14d
+; AVX512BW-NEXT:    orl %r12d, %r14d
+; AVX512BW-NEXT:    andl $1, %r15d
+; AVX512BW-NEXT:    movl %r15d, %r12d
+; AVX512BW-NEXT:    shll $22, %r12d
+; AVX512BW-NEXT:    orl %r14d, %r12d
+; AVX512BW-NEXT:    movl %r15d, %r14d
+; AVX512BW-NEXT:    shll $23, %r14d
+; AVX512BW-NEXT:    orl %r12d, %r14d
+; AVX512BW-NEXT:    movl %r15d, %r12d
+; AVX512BW-NEXT:    shll $24, %r12d
+; AVX512BW-NEXT:    orl %r14d, %r12d
+; AVX512BW-NEXT:    movl %r15d, %r14d
+; AVX512BW-NEXT:    shll $25, %r14d
+; AVX512BW-NEXT:    orl %r12d, %r14d
+; AVX512BW-NEXT:    shll $26, %r15d
+; AVX512BW-NEXT:    orl %r14d, %r15d
+; AVX512BW-NEXT:    movl %eax, %r14d
+; AVX512BW-NEXT:    andl $1, %r14d
+; AVX512BW-NEXT:    movl %r14d, %r12d
+; AVX512BW-NEXT:    shll $27, %r12d
+; AVX512BW-NEXT:    orl %r15d, %r12d
+; AVX512BW-NEXT:    movl %r14d, %r15d
+; AVX512BW-NEXT:    shll $28, %r15d
+; AVX512BW-NEXT:    orl %r12d, %r15d
+; AVX512BW-NEXT:    movl %r14d, %r12d
+; AVX512BW-NEXT:    shll $29, %r12d
+; AVX512BW-NEXT:    orl %r15d, %r12d
+; AVX512BW-NEXT:    kmovd %k1, %r15d
+; AVX512BW-NEXT:    kshiftrd $1, %k0, %k1
+; AVX512BW-NEXT:    shll $30, %r14d
+; AVX512BW-NEXT:    orl %r12d, %r14d
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrd $2, %k0, %k1
+; AVX512BW-NEXT:    shll $31, %eax
+; AVX512BW-NEXT:    orl %r14d, %eax
+; AVX512BW-NEXT:    orl %ebp, %eax
+; AVX512BW-NEXT:    movl %eax, 16(%rsi)
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    movzbl %al, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    leaq (%rax,%rax,2), %r14
+; AVX512BW-NEXT:    leaq (%r14,%rax,4), %r14
+; AVX512BW-NEXT:    leaq (%r14,%rax,8), %r14
+; AVX512BW-NEXT:    shlq $4, %rax
+; AVX512BW-NEXT:    orq %r14, %rax
+; AVX512BW-NEXT:    movzbl %r12b, %r13d
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movq %r13, %r14
+; AVX512BW-NEXT:    shlq $5, %r14
+; AVX512BW-NEXT:    orq %rax, %r14
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $6, %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $7, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $8, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    kmovd %k1, %ebp
+; AVX512BW-NEXT:    kshiftrd $3, %k0, %k1
+; AVX512BW-NEXT:    shlq $9, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movzbl %bpl, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $10, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $11, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $12, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $13, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    kshiftrd $4, %k0, %k1
+; AVX512BW-NEXT:    shlq $14, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    movzbl %al, %r13d
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $15, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $16, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $17, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $18, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    kshiftrd $5, %k0, %k1
+; AVX512BW-NEXT:    shlq $19, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    movzbl %al, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $20, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $21, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $22, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $23, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    kshiftrd $12, %k0, %k1
+; AVX512BW-NEXT:    shlq $24, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    movzbl %al, %r13d
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $25, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $26, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $27, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $28, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    shlq $29, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    movzbl %r15b, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %r12
+; AVX512BW-NEXT:    shlq $30, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    kmovd %k1, %ebp
+; AVX512BW-NEXT:    kshiftrd $7, %k0, %k1
+; AVX512BW-NEXT:    shlq $31, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    andl $1, %r15d
+; AVX512BW-NEXT:    movq %r15, %r12
+; AVX512BW-NEXT:    shlq $32, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $33, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrd $8, %k0, %k1
+; AVX512BW-NEXT:    shlq $34, %r15
+; AVX512BW-NEXT:    orq %rax, %r15
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $35, %rax
+; AVX512BW-NEXT:    orq %r15, %rax
+; AVX512BW-NEXT:    movq %r12, %r15
+; AVX512BW-NEXT:    shlq $36, %r15
+; AVX512BW-NEXT:    orq %rax, %r15
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $37, %rax
+; AVX512BW-NEXT:    orq %r15, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $38, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    kmovd %k1, %r15d
+; AVX512BW-NEXT:    kshiftrd $9, %k0, %k1
+; AVX512BW-NEXT:    shlq $39, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    andl $1, %r15d
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $40, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r15, %r12
+; AVX512BW-NEXT:    shlq $41, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $42, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r15, %r13
+; AVX512BW-NEXT:    shlq $43, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrd $10, %k0, %k1
+; AVX512BW-NEXT:    shlq $44, %r15
+; AVX512BW-NEXT:    orq %r13, %r15
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $45, %rax
+; AVX512BW-NEXT:    orq %r15, %rax
+; AVX512BW-NEXT:    movq %r12, %r15
+; AVX512BW-NEXT:    shlq $46, %r15
+; AVX512BW-NEXT:    orq %rax, %r15
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $47, %rax
+; AVX512BW-NEXT:    orq %r15, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $48, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    kmovd %k1, %r15d
+; AVX512BW-NEXT:    kshiftrd $11, %k0, %k1
+; AVX512BW-NEXT:    shlq $49, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    andl $1, %r15d
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $50, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r15, %r12
+; AVX512BW-NEXT:    shlq $51, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $52, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r15, %r12
+; AVX512BW-NEXT:    shlq $53, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    kshiftrd $20, %k0, %k1
+; AVX512BW-NEXT:    shlq $54, %r15
+; AVX512BW-NEXT:    orq %r12, %r15
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %r12
+; AVX512BW-NEXT:    shlq $55, %r12
+; AVX512BW-NEXT:    orq %r15, %r12
+; AVX512BW-NEXT:    movq %rax, %r15
+; AVX512BW-NEXT:    shlq $56, %r15
+; AVX512BW-NEXT:    orq %r12, %r15
+; AVX512BW-NEXT:    movq %rax, %r12
+; AVX512BW-NEXT:    shlq $57, %r12
+; AVX512BW-NEXT:    orq %r15, %r12
+; AVX512BW-NEXT:    movq %rax, %r15
+; AVX512BW-NEXT:    shlq $58, %r15
+; AVX512BW-NEXT:    orq %r12, %r15
+; AVX512BW-NEXT:    shlq $59, %rax
+; AVX512BW-NEXT:    orq %r15, %rax
+; AVX512BW-NEXT:    movl %ebp, %r15d
+; AVX512BW-NEXT:    andl $1, %r15d
+; AVX512BW-NEXT:    movq %r15, %r13
+; AVX512BW-NEXT:    shlq $60, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r15, %r12
+; AVX512BW-NEXT:    shlq $61, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    kmovd %k1, %r13d
+; AVX512BW-NEXT:    kshiftrd $21, %k0, %k1
+; AVX512BW-NEXT:    movzbl %bpl, %ebp
+; AVX512BW-NEXT:    shlq $62, %r15
+; AVX512BW-NEXT:    orq %r12, %r15
+; AVX512BW-NEXT:    movq %rbp, %r12
+; AVX512BW-NEXT:    shlq $63, %r12
+; AVX512BW-NEXT:    orq %r15, %r12
+; AVX512BW-NEXT:    kmovd %k1, %r15d
+; AVX512BW-NEXT:    kshiftrd $22, %k0, %k1
+; AVX512BW-NEXT:    orq %r14, %r12
+; AVX512BW-NEXT:    kmovd %k1, %r14d
+; AVX512BW-NEXT:    kshiftrd $23, %k0, %k1
+; AVX512BW-NEXT:    movq %r12, (%rsi)
+; AVX512BW-NEXT:    movzbl %bl, %ebx
+; AVX512BW-NEXT:    andl $1, %ebx
+; AVX512BW-NEXT:    andl $1, %ebp
+; AVX512BW-NEXT:    leaq (%rbp,%rbx,2), %rax
+; AVX512BW-NEXT:    leaq (%rax,%rbx,4), %rax
+; AVX512BW-NEXT:    leaq (%rax,%rbx,8), %rax
+; AVX512BW-NEXT:    movq %rbx, %r12
+; AVX512BW-NEXT:    shlq $4, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    shlq $5, %rbx
+; AVX512BW-NEXT:    orq %r12, %rbx
+; AVX512BW-NEXT:    movzbl %r11b, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %r11
+; AVX512BW-NEXT:    shlq $6, %r11
+; AVX512BW-NEXT:    movq %rax, %r12
+; AVX512BW-NEXT:    shlq $7, %r12
+; AVX512BW-NEXT:    orq %r11, %r12
+; AVX512BW-NEXT:    movq %rax, %r11
+; AVX512BW-NEXT:    shlq $8, %r11
+; AVX512BW-NEXT:    orq %r12, %r11
+; AVX512BW-NEXT:    movq %rax, %r12
+; AVX512BW-NEXT:    shlq $9, %r12
+; AVX512BW-NEXT:    orq %r11, %r12
+; AVX512BW-NEXT:    kmovd %k1, %r11d
+; AVX512BW-NEXT:    kshiftrd $24, %k0, %k0
+; AVX512BW-NEXT:    shlq $10, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movzbl %r10b, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %r10
+; AVX512BW-NEXT:    shlq $11, %r10
+; AVX512BW-NEXT:    orq %rax, %r10
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $12, %rax
+; AVX512BW-NEXT:    orq %r10, %rax
+; AVX512BW-NEXT:    movq %r12, %r10
+; AVX512BW-NEXT:    shlq $13, %r10
+; AVX512BW-NEXT:    orq %rax, %r10
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $14, %rax
+; AVX512BW-NEXT:    orq %r10, %rax
+; AVX512BW-NEXT:    kmovd %k0, %r10d
+; AVX512BW-NEXT:    shlq $15, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movzbl %r8b, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %r8
+; AVX512BW-NEXT:    shlq $16, %r8
+; AVX512BW-NEXT:    orq %r12, %r8
+; AVX512BW-NEXT:    movq %rax, %r12
+; AVX512BW-NEXT:    shlq $17, %r12
+; AVX512BW-NEXT:    orq %r8, %r12
+; AVX512BW-NEXT:    movq %rax, %r8
+; AVX512BW-NEXT:    shlq $18, %r8
+; AVX512BW-NEXT:    orq %r12, %r8
+; AVX512BW-NEXT:    movq %rax, %r12
+; AVX512BW-NEXT:    shlq $19, %r12
+; AVX512BW-NEXT:    orq %r8, %r12
+; AVX512BW-NEXT:    shlq $20, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movzbl %dil, %edi
+; AVX512BW-NEXT:    andl $1, %edi
+; AVX512BW-NEXT:    movq %rdi, %r8
+; AVX512BW-NEXT:    shlq $21, %r8
+; AVX512BW-NEXT:    orq %rax, %r8
+; AVX512BW-NEXT:    movq %rdi, %rax
+; AVX512BW-NEXT:    shlq $22, %rax
+; AVX512BW-NEXT:    orq %r8, %rax
+; AVX512BW-NEXT:    movq %rdi, %r8
+; AVX512BW-NEXT:    shlq $23, %r8
+; AVX512BW-NEXT:    orq %rax, %r8
+; AVX512BW-NEXT:    movq %rdi, %rax
+; AVX512BW-NEXT:    shlq $24, %rax
+; AVX512BW-NEXT:    orq %r8, %rax
+; AVX512BW-NEXT:    shlq $25, %rdi
+; AVX512BW-NEXT:    orq %rax, %rdi
+; AVX512BW-NEXT:    movzbl %dl, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %rdx
+; AVX512BW-NEXT:    shlq $26, %rdx
+; AVX512BW-NEXT:    orq %rdi, %rdx
+; AVX512BW-NEXT:    movq %rax, %rdi
+; AVX512BW-NEXT:    shlq $27, %rdi
+; AVX512BW-NEXT:    orq %rdx, %rdi
+; AVX512BW-NEXT:    movq %rax, %rdx
+; AVX512BW-NEXT:    shlq $28, %rdx
+; AVX512BW-NEXT:    orq %rdi, %rdx
+; AVX512BW-NEXT:    movq %rax, %rdi
+; AVX512BW-NEXT:    shlq $29, %rdi
+; AVX512BW-NEXT:    orq %rdx, %rdi
+; AVX512BW-NEXT:    shlq $30, %rax
+; AVX512BW-NEXT:    orq %rdi, %rax
+; AVX512BW-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
+; AVX512BW-NEXT:    andl $1, %edx
+; AVX512BW-NEXT:    movq %rdx, %rdi
+; AVX512BW-NEXT:    shlq $31, %rdi
+; AVX512BW-NEXT:    orq %rax, %rdi
+; AVX512BW-NEXT:    movq %rdx, %rax
+; AVX512BW-NEXT:    shlq $32, %rax
+; AVX512BW-NEXT:    orq %rdi, %rax
+; AVX512BW-NEXT:    movq %rdx, %rdi
+; AVX512BW-NEXT:    shlq $33, %rdi
+; AVX512BW-NEXT:    orq %rax, %rdi
+; AVX512BW-NEXT:    movq %rdx, %rax
+; AVX512BW-NEXT:    shlq $34, %rax
+; AVX512BW-NEXT:    orq %rdi, %rax
+; AVX512BW-NEXT:    shlq $35, %rdx
+; AVX512BW-NEXT:    orq %rax, %rdx
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $36, %rax
+; AVX512BW-NEXT:    orq %rdx, %rax
+; AVX512BW-NEXT:    movq %r13, %rdx
+; AVX512BW-NEXT:    shlq $37, %rdx
+; AVX512BW-NEXT:    orq %rax, %rdx
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $38, %rax
+; AVX512BW-NEXT:    orq %rdx, %rax
+; AVX512BW-NEXT:    movq %r13, %rdx
+; AVX512BW-NEXT:    shlq $39, %rdx
+; AVX512BW-NEXT:    orq %rax, %rdx
+; AVX512BW-NEXT:    shlq $40, %r13
+; AVX512BW-NEXT:    orq %rdx, %r13
+; AVX512BW-NEXT:    andl $1, %r15d
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $41, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r15, %rdx
+; AVX512BW-NEXT:    shlq $42, %rdx
+; AVX512BW-NEXT:    orq %rax, %rdx
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $43, %rax
+; AVX512BW-NEXT:    orq %rdx, %rax
+; AVX512BW-NEXT:    movq %r15, %rdx
+; AVX512BW-NEXT:    shlq $44, %rdx
+; AVX512BW-NEXT:    orq %rax, %rdx
+; AVX512BW-NEXT:    shlq $45, %r15
+; AVX512BW-NEXT:    orq %rdx, %r15
+; AVX512BW-NEXT:    andl $1, %r14d
+; AVX512BW-NEXT:    movq %r14, %rax
+; AVX512BW-NEXT:    shlq $46, %rax
+; AVX512BW-NEXT:    orq %r15, %rax
+; AVX512BW-NEXT:    movq %r14, %rdx
+; AVX512BW-NEXT:    shlq $47, %rdx
+; AVX512BW-NEXT:    orq %rax, %rdx
+; AVX512BW-NEXT:    movq %r14, %rax
+; AVX512BW-NEXT:    shlq $48, %rax
+; AVX512BW-NEXT:    orq %rdx, %rax
+; AVX512BW-NEXT:    movq %r14, %rdx
+; AVX512BW-NEXT:    shlq $49, %rdx
+; AVX512BW-NEXT:    orq %rax, %rdx
+; AVX512BW-NEXT:    shlq $50, %r14
+; AVX512BW-NEXT:    orq %rdx, %r14
+; AVX512BW-NEXT:    andl $1, %r11d
+; AVX512BW-NEXT:    movq %r11, %rax
+; AVX512BW-NEXT:    shlq $51, %rax
+; AVX512BW-NEXT:    orq %r14, %rax
+; AVX512BW-NEXT:    movq %r11, %rdx
+; AVX512BW-NEXT:    shlq $52, %rdx
+; AVX512BW-NEXT:    orq %rax, %rdx
+; AVX512BW-NEXT:    movq %r11, %rax
+; AVX512BW-NEXT:    shlq $53, %rax
+; AVX512BW-NEXT:    orq %rdx, %rax
+; AVX512BW-NEXT:    movq %r11, %rdx
+; AVX512BW-NEXT:    shlq $54, %rdx
+; AVX512BW-NEXT:    orq %rax, %rdx
+; AVX512BW-NEXT:    shlq $55, %r11
+; AVX512BW-NEXT:    orq %rdx, %r11
+; AVX512BW-NEXT:    andl $1, %r10d
+; AVX512BW-NEXT:    movq %r10, %rax
+; AVX512BW-NEXT:    shlq $56, %rax
+; AVX512BW-NEXT:    orq %r11, %rax
+; AVX512BW-NEXT:    movq %r10, %rdx
+; AVX512BW-NEXT:    shlq $57, %rdx
+; AVX512BW-NEXT:    orq %rax, %rdx
+; AVX512BW-NEXT:    movq %r10, %rax
+; AVX512BW-NEXT:    shlq $58, %rax
+; AVX512BW-NEXT:    orq %rdx, %rax
+; AVX512BW-NEXT:    movq %r10, %rdx
+; AVX512BW-NEXT:    shlq $59, %rdx
+; AVX512BW-NEXT:    orq %rax, %rdx
+; AVX512BW-NEXT:    shlq $60, %r10
+; AVX512BW-NEXT:    orq %rdx, %r10
+; AVX512BW-NEXT:    movq %r9, %rax
+; AVX512BW-NEXT:    shlq $61, %rax
+; AVX512BW-NEXT:    orq %r10, %rax
+; AVX512BW-NEXT:    shlq $62, %r9
+; AVX512BW-NEXT:    orq %rax, %r9
+; AVX512BW-NEXT:    movzbl %cl, %eax
+; AVX512BW-NEXT:    shlq $63, %rax
+; AVX512BW-NEXT:    orq %r9, %rax
+; AVX512BW-NEXT:    orq %rbx, %rax
+; AVX512BW-NEXT:    movq %rax, 8(%rsi)
+; AVX512BW-NEXT:    popq %rbx
+; AVX512BW-NEXT:    popq %r12
+; AVX512BW-NEXT:    popq %r13
+; AVX512BW-NEXT:    popq %r14
+; AVX512BW-NEXT:    popq %r15
+; AVX512BW-NEXT:    popq %rbp
+; AVX512BW-NEXT:    retq
+  %src.vec = load <32 x i1>, ptr %in.vec, align 64
+  %tgt.mask = shufflevector <32 x i1> %src.vec, <32 x i1> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
+  store <160 x i1> %tgt.mask, ptr %out.vec, align 64
+  ret void
+}
+
+define void @mask_replication_factor5_vf64(ptr %in.vec, ptr %out.vec) nounwind {
+; AVX512F-ONLY-LABEL: mask_replication_factor5_vf64:
+; AVX512F-ONLY:       # %bb.0:
+; AVX512F-ONLY-NEXT:    pushq %rbp
+; AVX512F-ONLY-NEXT:    pushq %r15
+; AVX512F-ONLY-NEXT:    pushq %r14
+; AVX512F-ONLY-NEXT:    pushq %r13
+; AVX512F-ONLY-NEXT:    pushq %r12
+; AVX512F-ONLY-NEXT:    pushq %rbx
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
+; AVX512F-ONLY-NEXT:    kmovw 2(%rdi), %k0
+; AVX512F-ONLY-NEXT:    kmovw 4(%rdi), %k3
+; AVX512F-ONLY-NEXT:    kmovw 6(%rdi), %k2
+; AVX512F-ONLY-NEXT:    kshiftrw $7, %k0, %k4
+; AVX512F-ONLY-NEXT:    kmovw %k4, %eax
+; AVX512F-ONLY-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512F-ONLY-NEXT:    kshiftrw $3, %k0, %k4
+; AVX512F-ONLY-NEXT:    kmovw %k4, %ecx
+; AVX512F-ONLY-NEXT:    kshiftrw $2, %k0, %k4
+; AVX512F-ONLY-NEXT:    kmovw %k4, %edx
+; AVX512F-ONLY-NEXT:    kshiftrw $1, %k0, %k4
+; AVX512F-ONLY-NEXT:    kmovw %k4, %edi
+; AVX512F-ONLY-NEXT:    kshiftrw $15, %k1, %k4
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r8d
+; AVX512F-ONLY-NEXT:    kshiftrw $14, %k1, %k4
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r9d
+; AVX512F-ONLY-NEXT:    kshiftrw $11, %k0, %k4
+; AVX512F-ONLY-NEXT:    kmovw %k4, %ebp
+; AVX512F-ONLY-NEXT:    kshiftrw $9, %k2, %k4
+; AVX512F-ONLY-NEXT:    kmovw %k4, %ebx
+; AVX512F-ONLY-NEXT:    kshiftrw $4, %k2, %k4
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r14d
+; AVX512F-ONLY-NEXT:    kshiftrw $6, %k1, %k4
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r10d
+; AVX512F-ONLY-NEXT:    kshiftrw $1, %k1, %k4
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r11d
+; AVX512F-ONLY-NEXT:    kshiftrw $2, %k1, %k4
+; AVX512F-ONLY-NEXT:    kmovw %k1, %r15d
+; AVX512F-ONLY-NEXT:    movzbl %r15b, %r15d
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    leaq (%r15,%r15,2), %r12
+; AVX512F-ONLY-NEXT:    leaq (%r12,%r15,4), %r12
+; AVX512F-ONLY-NEXT:    leaq (%r12,%r15,8), %r12
+; AVX512F-ONLY-NEXT:    shlq $4, %r15
+; AVX512F-ONLY-NEXT:    orq %r12, %r15
+; AVX512F-ONLY-NEXT:    movzbl %r11b, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %r11
+; AVX512F-ONLY-NEXT:    shlq $5, %r11
+; AVX512F-ONLY-NEXT:    orq %r15, %r11
+; AVX512F-ONLY-NEXT:    movq %r12, %r15
+; AVX512F-ONLY-NEXT:    shlq $6, %r15
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $7, %r13
+; AVX512F-ONLY-NEXT:    orq %r15, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %r15
+; AVX512F-ONLY-NEXT:    shlq $8, %r15
+; AVX512F-ONLY-NEXT:    orq %r13, %r15
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r13d
+; AVX512F-ONLY-NEXT:    kshiftrw $3, %k1, %k4
+; AVX512F-ONLY-NEXT:    shlq $9, %r12
+; AVX512F-ONLY-NEXT:    orq %r15, %r12
+; AVX512F-ONLY-NEXT:    movzbl %r13b, %r15d
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    movq %r15, %r13
+; AVX512F-ONLY-NEXT:    shlq $10, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $11, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    movq %r15, %r13
+; AVX512F-ONLY-NEXT:    shlq $12, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $13, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r13d
+; AVX512F-ONLY-NEXT:    kshiftrw $4, %k1, %k4
+; AVX512F-ONLY-NEXT:    shlq $14, %r15
+; AVX512F-ONLY-NEXT:    orq %r12, %r15
+; AVX512F-ONLY-NEXT:    movzbl %r13b, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $15, %r13
+; AVX512F-ONLY-NEXT:    orq %r15, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %r15
+; AVX512F-ONLY-NEXT:    shlq $16, %r15
+; AVX512F-ONLY-NEXT:    orq %r13, %r15
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $17, %r13
+; AVX512F-ONLY-NEXT:    orq %r15, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %r15
+; AVX512F-ONLY-NEXT:    shlq $18, %r15
+; AVX512F-ONLY-NEXT:    orq %r13, %r15
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r13d
+; AVX512F-ONLY-NEXT:    kshiftrw $5, %k1, %k4
+; AVX512F-ONLY-NEXT:    shlq $19, %r12
+; AVX512F-ONLY-NEXT:    orq %r15, %r12
+; AVX512F-ONLY-NEXT:    movzbl %r13b, %r15d
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    movq %r15, %r13
+; AVX512F-ONLY-NEXT:    shlq $20, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $21, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    movq %r15, %r13
+; AVX512F-ONLY-NEXT:    shlq $22, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $23, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r13d
+; AVX512F-ONLY-NEXT:    kshiftrw $12, %k1, %k4
+; AVX512F-ONLY-NEXT:    shlq $24, %r15
+; AVX512F-ONLY-NEXT:    orq %r12, %r15
+; AVX512F-ONLY-NEXT:    movzbl %r13b, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $25, %r13
+; AVX512F-ONLY-NEXT:    orq %r15, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %r15
+; AVX512F-ONLY-NEXT:    shlq $26, %r15
+; AVX512F-ONLY-NEXT:    orq %r13, %r15
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $27, %r13
+; AVX512F-ONLY-NEXT:    orq %r15, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %r15
+; AVX512F-ONLY-NEXT:    shlq $28, %r15
+; AVX512F-ONLY-NEXT:    orq %r13, %r15
+; AVX512F-ONLY-NEXT:    shlq $29, %r12
+; AVX512F-ONLY-NEXT:    orq %r15, %r12
+; AVX512F-ONLY-NEXT:    movzbl %r10b, %r13d
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $30, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r15d
+; AVX512F-ONLY-NEXT:    kshiftrw $7, %k1, %k4
+; AVX512F-ONLY-NEXT:    shlq $31, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    andl $1, %r10d
+; AVX512F-ONLY-NEXT:    movq %r10, %rax
+; AVX512F-ONLY-NEXT:    shlq $32, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r10, %r13
+; AVX512F-ONLY-NEXT:    shlq $33, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $8, %k1, %k4
+; AVX512F-ONLY-NEXT:    shlq $34, %r10
+; AVX512F-ONLY-NEXT:    orq %r13, %r10
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $35, %rax
+; AVX512F-ONLY-NEXT:    orq %r10, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r10
+; AVX512F-ONLY-NEXT:    shlq $36, %r10
+; AVX512F-ONLY-NEXT:    orq %rax, %r10
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $37, %rax
+; AVX512F-ONLY-NEXT:    orq %r10, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $38, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r10d
+; AVX512F-ONLY-NEXT:    kshiftrw $9, %k1, %k4
+; AVX512F-ONLY-NEXT:    shlq $39, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    andl $1, %r10d
+; AVX512F-ONLY-NEXT:    movq %r10, %rax
+; AVX512F-ONLY-NEXT:    shlq $40, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r10, %r12
+; AVX512F-ONLY-NEXT:    shlq $41, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r10, %rax
+; AVX512F-ONLY-NEXT:    shlq $42, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r10, %r13
+; AVX512F-ONLY-NEXT:    shlq $43, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $10, %k1, %k4
+; AVX512F-ONLY-NEXT:    shlq $44, %r10
+; AVX512F-ONLY-NEXT:    orq %r13, %r10
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $45, %rax
+; AVX512F-ONLY-NEXT:    orq %r10, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r10
+; AVX512F-ONLY-NEXT:    shlq $46, %r10
+; AVX512F-ONLY-NEXT:    orq %rax, %r10
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $47, %rax
+; AVX512F-ONLY-NEXT:    orq %r10, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $48, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r10d
+; AVX512F-ONLY-NEXT:    kshiftrw $11, %k1, %k4
+; AVX512F-ONLY-NEXT:    shlq $49, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    andl $1, %r10d
+; AVX512F-ONLY-NEXT:    movq %r10, %rax
+; AVX512F-ONLY-NEXT:    shlq $50, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r10, %r12
+; AVX512F-ONLY-NEXT:    shlq $51, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r10, %rax
+; AVX512F-ONLY-NEXT:    shlq $52, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r10, %r13
+; AVX512F-ONLY-NEXT:    shlq $53, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $3, %k2, %k4
+; AVX512F-ONLY-NEXT:    shlq $54, %r10
+; AVX512F-ONLY-NEXT:    orq %r13, %r10
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $55, %rax
+; AVX512F-ONLY-NEXT:    orq %r10, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r10
+; AVX512F-ONLY-NEXT:    shlq $56, %r10
+; AVX512F-ONLY-NEXT:    orq %rax, %r10
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $57, %rax
+; AVX512F-ONLY-NEXT:    orq %r10, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r10
+; AVX512F-ONLY-NEXT:    shlq $58, %r10
+; AVX512F-ONLY-NEXT:    orq %rax, %r10
+; AVX512F-ONLY-NEXT:    shlq $59, %r12
+; AVX512F-ONLY-NEXT:    orq %r10, %r12
+; AVX512F-ONLY-NEXT:    movzbl %r15b, %r10d
+; AVX512F-ONLY-NEXT:    movl %r15d, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %r15
+; AVX512F-ONLY-NEXT:    shlq $60, %r15
+; AVX512F-ONLY-NEXT:    orq %r12, %r15
+; AVX512F-ONLY-NEXT:    movq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $61, %r12
+; AVX512F-ONLY-NEXT:    orq %r15, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r15d
+; AVX512F-ONLY-NEXT:    kshiftrw $5, %k2, %k4
+; AVX512F-ONLY-NEXT:    shlq $62, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r10, %r12
+; AVX512F-ONLY-NEXT:    shlq $63, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    orq %r11, %r12
+; AVX512F-ONLY-NEXT:    movq %r12, (%rsi)
+; AVX512F-ONLY-NEXT:    movzbl %r15b, %r11d
+; AVX512F-ONLY-NEXT:    movl %r11d, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    leaq (%rax,%rax,2), %r15
+; AVX512F-ONLY-NEXT:    leaq (%r15,%rax,4), %r15
+; AVX512F-ONLY-NEXT:    leaq (%r15,%rax,8), %rax
+; AVX512F-ONLY-NEXT:    movzbl %r14b, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %r15
+; AVX512F-ONLY-NEXT:    shlq $4, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    movq %r12, %r14
+; AVX512F-ONLY-NEXT:    shlq $5, %r14
+; AVX512F-ONLY-NEXT:    orq %r15, %r14
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $6, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r15
+; AVX512F-ONLY-NEXT:    shlq $7, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    kmovw %k4, %eax
+; AVX512F-ONLY-NEXT:    kshiftrw $6, %k2, %k4
+; AVX512F-ONLY-NEXT:    shlq $8, %r12
+; AVX512F-ONLY-NEXT:    orq %r15, %r12
+; AVX512F-ONLY-NEXT:    movzbl %al, %r15d
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $9, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $10, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $11, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $12, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k4, %eax
+; AVX512F-ONLY-NEXT:    kshiftrw $7, %k2, %k4
+; AVX512F-ONLY-NEXT:    shlq $13, %r15
+; AVX512F-ONLY-NEXT:    orq %r12, %r15
+; AVX512F-ONLY-NEXT:    movzbl %al, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $14, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r15
+; AVX512F-ONLY-NEXT:    shlq $15, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $16, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r15
+; AVX512F-ONLY-NEXT:    shlq $17, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    kmovw %k4, %eax
+; AVX512F-ONLY-NEXT:    kshiftrw $8, %k2, %k4
+; AVX512F-ONLY-NEXT:    shlq $18, %r12
+; AVX512F-ONLY-NEXT:    orq %r15, %r12
+; AVX512F-ONLY-NEXT:    movzbl %al, %r15d
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $19, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $20, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $21, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $22, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k4, %eax
+; AVX512F-ONLY-NEXT:    kshiftrw $15, %k2, %k4
+; AVX512F-ONLY-NEXT:    shlq $23, %r15
+; AVX512F-ONLY-NEXT:    orq %r12, %r15
+; AVX512F-ONLY-NEXT:    movzbl %al, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $24, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r15
+; AVX512F-ONLY-NEXT:    shlq $25, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $26, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r15
+; AVX512F-ONLY-NEXT:    shlq $27, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    shlq $28, %r12
+; AVX512F-ONLY-NEXT:    orq %r15, %r12
+; AVX512F-ONLY-NEXT:    movzbl %bl, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %r15
+; AVX512F-ONLY-NEXT:    shlq $29, %r15
+; AVX512F-ONLY-NEXT:    orq %r12, %r15
+; AVX512F-ONLY-NEXT:    movq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $30, %r12
+; AVX512F-ONLY-NEXT:    orq %r15, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r15d
+; AVX512F-ONLY-NEXT:    kshiftrw $10, %k2, %k4
+; AVX512F-ONLY-NEXT:    shlq $31, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    andl $1, %ebx
+; AVX512F-ONLY-NEXT:    movq %rbx, %r13
+; AVX512F-ONLY-NEXT:    shlq $32, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $11, %k2, %k4
+; AVX512F-ONLY-NEXT:    shlq $33, %rbx
+; AVX512F-ONLY-NEXT:    orq %r13, %rbx
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $34, %rax
+; AVX512F-ONLY-NEXT:    orq %rbx, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %rbx
+; AVX512F-ONLY-NEXT:    shlq $35, %rbx
+; AVX512F-ONLY-NEXT:    orq %rax, %rbx
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $36, %rax
+; AVX512F-ONLY-NEXT:    orq %rbx, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $37, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k4, %ebx
+; AVX512F-ONLY-NEXT:    kshiftrw $12, %k2, %k4
+; AVX512F-ONLY-NEXT:    shlq $38, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    andl $1, %ebx
+; AVX512F-ONLY-NEXT:    movq %rbx, %rax
+; AVX512F-ONLY-NEXT:    shlq $39, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %rbx, %r12
+; AVX512F-ONLY-NEXT:    shlq $40, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %rbx, %rax
+; AVX512F-ONLY-NEXT:    shlq $41, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %rbx, %r13
+; AVX512F-ONLY-NEXT:    shlq $42, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $13, %k2, %k4
+; AVX512F-ONLY-NEXT:    shlq $43, %rbx
+; AVX512F-ONLY-NEXT:    orq %r13, %rbx
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $44, %rax
+; AVX512F-ONLY-NEXT:    orq %rbx, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %rbx
+; AVX512F-ONLY-NEXT:    shlq $45, %rbx
+; AVX512F-ONLY-NEXT:    orq %rax, %rbx
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $46, %rax
+; AVX512F-ONLY-NEXT:    orq %rbx, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %rbx
+; AVX512F-ONLY-NEXT:    shlq $47, %rbx
+; AVX512F-ONLY-NEXT:    orq %rax, %rbx
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r13d
+; AVX512F-ONLY-NEXT:    kshiftrw $14, %k2, %k4
+; AVX512F-ONLY-NEXT:    shlq $48, %r12
+; AVX512F-ONLY-NEXT:    orq %rbx, %r12
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $49, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %rbx
+; AVX512F-ONLY-NEXT:    shlq $50, %rbx
+; AVX512F-ONLY-NEXT:    orq %rax, %rbx
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $51, %rax
+; AVX512F-ONLY-NEXT:    orq %rbx, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $52, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k4, %ebx
+; AVX512F-ONLY-NEXT:    kshiftrw $9, %k0, %k4
+; AVX512F-ONLY-NEXT:    shlq $53, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    andl $1, %ebx
+; AVX512F-ONLY-NEXT:    movq %rbx, %rax
+; AVX512F-ONLY-NEXT:    shlq $54, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %rbx, %r12
+; AVX512F-ONLY-NEXT:    shlq $55, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %rbx, %rax
+; AVX512F-ONLY-NEXT:    shlq $56, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %rbx, %r12
+; AVX512F-ONLY-NEXT:    shlq $57, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $58, %rbx
+; AVX512F-ONLY-NEXT:    orq %r12, %rbx
+; AVX512F-ONLY-NEXT:    movzbl %r15b, %eax
+; AVX512F-ONLY-NEXT:    # kill: def $r15d killed $r15d def $r15
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $59, %r12
+; AVX512F-ONLY-NEXT:    orq %rbx, %r12
+; AVX512F-ONLY-NEXT:    movq %r15, %rbx
+; AVX512F-ONLY-NEXT:    shlq $60, %rbx
+; AVX512F-ONLY-NEXT:    orq %r12, %rbx
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $61, %r12
+; AVX512F-ONLY-NEXT:    orq %rbx, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k4, %ebx
+; AVX512F-ONLY-NEXT:    kshiftrw $10, %k0, %k4
+; AVX512F-ONLY-NEXT:    shlq $62, %r15
+; AVX512F-ONLY-NEXT:    orq %r12, %r15
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $12, %k0, %k4
+; AVX512F-ONLY-NEXT:    shlq $63, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    orq %r14, %rax
+; AVX512F-ONLY-NEXT:    movq %rax, 32(%rsi)
+; AVX512F-ONLY-NEXT:    movzbl %r12b, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movzbl %bl, %r14d
+; AVX512F-ONLY-NEXT:    movl %r14d, %r15d
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    leaq (%r15,%r15,2), %r15
+; AVX512F-ONLY-NEXT:    leaq (%r15,%rax,4), %r15
+; AVX512F-ONLY-NEXT:    leaq (%r15,%rax,8), %r15
+; AVX512F-ONLY-NEXT:    movq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $4, %r12
+; AVX512F-ONLY-NEXT:    orq %r15, %r12
+; AVX512F-ONLY-NEXT:    movq %rax, %r15
+; AVX512F-ONLY-NEXT:    shlq $5, %r15
+; AVX512F-ONLY-NEXT:    orq %r12, %r15
+; AVX512F-ONLY-NEXT:    shlq $6, %rax
+; AVX512F-ONLY-NEXT:    movzbl %bpl, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $7, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $8, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $9, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $10, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k4, %ebp
+; AVX512F-ONLY-NEXT:    kshiftrw $13, %k0, %k4
+; AVX512F-ONLY-NEXT:    shlq $11, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movzbl %bpl, %r13d
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $12, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $13, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $14, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $15, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k4, %eax
+; AVX512F-ONLY-NEXT:    kshiftrw $14, %k0, %k4
+; AVX512F-ONLY-NEXT:    shlq $16, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    movzbl %al, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $17, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $18, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $19, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $20, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k4, %eax
+; AVX512F-ONLY-NEXT:    kshiftrw $15, %k0, %k4
+; AVX512F-ONLY-NEXT:    shlq $21, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    movzbl %al, %r13d
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $22, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $23, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $24, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $25, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k4, %eax
+; AVX512F-ONLY-NEXT:    kshiftrw $1, %k3, %k4
+; AVX512F-ONLY-NEXT:    shlq $26, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    movzbl %al, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $27, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $28, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $29, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $30, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    shlq $31, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k3, %r13d
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $32, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $33, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $34, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %rbp
+; AVX512F-ONLY-NEXT:    shlq $35, %rbp
+; AVX512F-ONLY-NEXT:    orq %rax, %rbp
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $2, %k3, %k4
+; AVX512F-ONLY-NEXT:    shlq $36, %r13
+; AVX512F-ONLY-NEXT:    orq %rbp, %r13
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $37, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $38, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $39, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %rbp
+; AVX512F-ONLY-NEXT:    shlq $40, %rbp
+; AVX512F-ONLY-NEXT:    orq %rax, %rbp
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r13d
+; AVX512F-ONLY-NEXT:    kshiftrw $3, %k3, %k4
+; AVX512F-ONLY-NEXT:    shlq $41, %r12
+; AVX512F-ONLY-NEXT:    orq %rbp, %r12
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $42, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $43, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $44, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %rbp
+; AVX512F-ONLY-NEXT:    shlq $45, %rbp
+; AVX512F-ONLY-NEXT:    orq %rax, %rbp
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $4, %k3, %k4
+; AVX512F-ONLY-NEXT:    shlq $46, %r13
+; AVX512F-ONLY-NEXT:    orq %rbp, %r13
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $47, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $48, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $49, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %rbp
+; AVX512F-ONLY-NEXT:    shlq $50, %rbp
+; AVX512F-ONLY-NEXT:    orq %rax, %rbp
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r13d
+; AVX512F-ONLY-NEXT:    kshiftrw $5, %k3, %k4
+; AVX512F-ONLY-NEXT:    shlq $51, %r12
+; AVX512F-ONLY-NEXT:    orq %rbp, %r12
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $52, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $53, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $54, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %rbp
+; AVX512F-ONLY-NEXT:    shlq $55, %rbp
+; AVX512F-ONLY-NEXT:    orq %rax, %rbp
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $6, %k3, %k4
+; AVX512F-ONLY-NEXT:    shlq $56, %r13
+; AVX512F-ONLY-NEXT:    orq %rbp, %r13
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $57, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $58, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $59, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $60, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k4, %eax
+; AVX512F-ONLY-NEXT:    kshiftrw $8, %k3, %k4
+; AVX512F-ONLY-NEXT:    shlq $61, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    movzbl %al, %r13d
+; AVX512F-ONLY-NEXT:    # kill: def $eax killed $eax def $rax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    shlq $62, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $63, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k4, %eax
+; AVX512F-ONLY-NEXT:    kshiftrw $7, %k3, %k4
+; AVX512F-ONLY-NEXT:    orq %r15, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k4, %ebp
+; AVX512F-ONLY-NEXT:    kshiftrw $9, %k3, %k4
+; AVX512F-ONLY-NEXT:    movq %r12, 16(%rsi)
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    leaq (%r13,%r13,2), %r15
+; AVX512F-ONLY-NEXT:    leaq (%r15,%r13,4), %r15
+; AVX512F-ONLY-NEXT:    movzbl %bpl, %r13d
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    leaq (%r15,%r13,8), %r15
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $4, %r12
+; AVX512F-ONLY-NEXT:    orq %r15, %r12
+; AVX512F-ONLY-NEXT:    movq %r13, %r15
+; AVX512F-ONLY-NEXT:    shlq $5, %r15
+; AVX512F-ONLY-NEXT:    orq %r12, %r15
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $6, %r12
+; AVX512F-ONLY-NEXT:    shlq $7, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    movzbl %al, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $8, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $9, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $10, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $11, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k4, %eax
+; AVX512F-ONLY-NEXT:    kshiftrw $10, %k3, %k4
+; AVX512F-ONLY-NEXT:    shlq $12, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    movzbl %al, %r13d
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $13, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $14, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $15, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $16, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k4, %eax
+; AVX512F-ONLY-NEXT:    kshiftrw $11, %k3, %k4
+; AVX512F-ONLY-NEXT:    shlq $17, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    movzbl %al, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $18, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $19, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $20, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $21, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k4, %eax
+; AVX512F-ONLY-NEXT:    kshiftrw $12, %k3, %k4
+; AVX512F-ONLY-NEXT:    shlq $22, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    movzbl %al, %r13d
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $23, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $24, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $25, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $26, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k4, %eax
+; AVX512F-ONLY-NEXT:    kshiftrw $13, %k3, %k4
+; AVX512F-ONLY-NEXT:    shlq $27, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    movzbl %al, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $28, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $29, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $30, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %rbp
+; AVX512F-ONLY-NEXT:    shlq $31, %rbp
+; AVX512F-ONLY-NEXT:    orq %rax, %rbp
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r13d
+; AVX512F-ONLY-NEXT:    kshiftrw $14, %k3, %k4
+; AVX512F-ONLY-NEXT:    shlq $32, %r12
+; AVX512F-ONLY-NEXT:    orq %rbp, %r12
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $33, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $34, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $35, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %rbp
+; AVX512F-ONLY-NEXT:    shlq $36, %rbp
+; AVX512F-ONLY-NEXT:    orq %rax, %rbp
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $15, %k3, %k3
+; AVX512F-ONLY-NEXT:    shlq $37, %r13
+; AVX512F-ONLY-NEXT:    orq %rbp, %r13
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $38, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $39, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $40, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $41, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k3, %eax
+; AVX512F-ONLY-NEXT:    kshiftrw $1, %k2, %k3
+; AVX512F-ONLY-NEXT:    shlq $42, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    movzbl %al, %r13d
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $43, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $44, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $45, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $46, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $47, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $48, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $49, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $50, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %rbp
+; AVX512F-ONLY-NEXT:    shlq $51, %rbp
+; AVX512F-ONLY-NEXT:    orq %rax, %rbp
+; AVX512F-ONLY-NEXT:    kmovw %k3, %r13d
+; AVX512F-ONLY-NEXT:    kshiftrw $2, %k2, %k2
+; AVX512F-ONLY-NEXT:    shlq $52, %r12
+; AVX512F-ONLY-NEXT:    orq %rbp, %r12
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $53, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $54, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $55, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %rbp
+; AVX512F-ONLY-NEXT:    shlq $56, %rbp
+; AVX512F-ONLY-NEXT:    orq %rax, %rbp
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $13, %k1, %k1
+; AVX512F-ONLY-NEXT:    shlq $57, %r13
+; AVX512F-ONLY-NEXT:    orq %rbp, %r13
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $58, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $59, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $60, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $61, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k1, %ebp
+; AVX512F-ONLY-NEXT:    kshiftrw $4, %k0, %k1
+; AVX512F-ONLY-NEXT:    shlq $62, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k1, %r13d
+; AVX512F-ONLY-NEXT:    kshiftrw $5, %k0, %k1
+; AVX512F-ONLY-NEXT:    shlq $63, %r11
+; AVX512F-ONLY-NEXT:    orq %r12, %r11
+; AVX512F-ONLY-NEXT:    kmovw %k1, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $6, %k0, %k1
+; AVX512F-ONLY-NEXT:    orq %r15, %r11
+; AVX512F-ONLY-NEXT:    kmovw %k1, %r15d
+; AVX512F-ONLY-NEXT:    movq %r11, 24(%rsi)
+; AVX512F-ONLY-NEXT:    movzbl %bpl, %r11d
+; AVX512F-ONLY-NEXT:    andl $1, %r11d
+; AVX512F-ONLY-NEXT:    andl $1, %r10d
+; AVX512F-ONLY-NEXT:    leaq (%r10,%r11,2), %rax
+; AVX512F-ONLY-NEXT:    leaq (%rax,%r11,4), %rax
+; AVX512F-ONLY-NEXT:    leaq (%rax,%r11,8), %rax
+; AVX512F-ONLY-NEXT:    movq %r11, %r10
+; AVX512F-ONLY-NEXT:    shlq $4, %r10
+; AVX512F-ONLY-NEXT:    orq %rax, %r10
+; AVX512F-ONLY-NEXT:    shlq $5, %r11
+; AVX512F-ONLY-NEXT:    orq %r10, %r11
+; AVX512F-ONLY-NEXT:    movzbl %r9b, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %r9
+; AVX512F-ONLY-NEXT:    shlq $6, %r9
+; AVX512F-ONLY-NEXT:    movq %rax, %r10
+; AVX512F-ONLY-NEXT:    shlq $7, %r10
+; AVX512F-ONLY-NEXT:    orq %r9, %r10
+; AVX512F-ONLY-NEXT:    movq %rax, %r9
+; AVX512F-ONLY-NEXT:    shlq $8, %r9
+; AVX512F-ONLY-NEXT:    orq %r10, %r9
+; AVX512F-ONLY-NEXT:    movq %rax, %rbp
+; AVX512F-ONLY-NEXT:    shlq $9, %rbp
+; AVX512F-ONLY-NEXT:    orq %r9, %rbp
+; AVX512F-ONLY-NEXT:    kmovw %k0, %r10d
+; AVX512F-ONLY-NEXT:    kshiftrw $8, %k0, %k0
+; AVX512F-ONLY-NEXT:    shlq $10, %rax
+; AVX512F-ONLY-NEXT:    orq %rbp, %rax
+; AVX512F-ONLY-NEXT:    movzbl %r8b, %r9d
+; AVX512F-ONLY-NEXT:    andl $1, %r9d
+; AVX512F-ONLY-NEXT:    movq %r9, %r8
+; AVX512F-ONLY-NEXT:    shlq $11, %r8
+; AVX512F-ONLY-NEXT:    orq %rax, %r8
+; AVX512F-ONLY-NEXT:    movq %r9, %rax
+; AVX512F-ONLY-NEXT:    shlq $12, %rax
+; AVX512F-ONLY-NEXT:    orq %r8, %rax
+; AVX512F-ONLY-NEXT:    movq %r9, %r8
+; AVX512F-ONLY-NEXT:    shlq $13, %r8
+; AVX512F-ONLY-NEXT:    orq %rax, %r8
+; AVX512F-ONLY-NEXT:    movq %r9, %rax
+; AVX512F-ONLY-NEXT:    shlq $14, %rax
+; AVX512F-ONLY-NEXT:    orq %r8, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k0, %r8d
+; AVX512F-ONLY-NEXT:    shlq $15, %r9
+; AVX512F-ONLY-NEXT:    orq %rax, %r9
+; AVX512F-ONLY-NEXT:    movzbl %r10b, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %r10
+; AVX512F-ONLY-NEXT:    shlq $16, %r10
+; AVX512F-ONLY-NEXT:    orq %r9, %r10
+; AVX512F-ONLY-NEXT:    movq %rax, %r9
+; AVX512F-ONLY-NEXT:    shlq $17, %r9
+; AVX512F-ONLY-NEXT:    orq %r10, %r9
+; AVX512F-ONLY-NEXT:    movq %rax, %r10
+; AVX512F-ONLY-NEXT:    shlq $18, %r10
+; AVX512F-ONLY-NEXT:    orq %r9, %r10
+; AVX512F-ONLY-NEXT:    movq %rax, %r9
+; AVX512F-ONLY-NEXT:    shlq $19, %r9
+; AVX512F-ONLY-NEXT:    orq %r10, %r9
+; AVX512F-ONLY-NEXT:    shlq $20, %rax
+; AVX512F-ONLY-NEXT:    orq %r9, %rax
+; AVX512F-ONLY-NEXT:    movzbl %dil, %edi
+; AVX512F-ONLY-NEXT:    andl $1, %edi
+; AVX512F-ONLY-NEXT:    movq %rdi, %r9
+; AVX512F-ONLY-NEXT:    shlq $21, %r9
+; AVX512F-ONLY-NEXT:    orq %rax, %r9
+; AVX512F-ONLY-NEXT:    movq %rdi, %rax
+; AVX512F-ONLY-NEXT:    shlq $22, %rax
+; AVX512F-ONLY-NEXT:    orq %r9, %rax
+; AVX512F-ONLY-NEXT:    movq %rdi, %r9
+; AVX512F-ONLY-NEXT:    shlq $23, %r9
+; AVX512F-ONLY-NEXT:    orq %rax, %r9
+; AVX512F-ONLY-NEXT:    movq %rdi, %rax
+; AVX512F-ONLY-NEXT:    shlq $24, %rax
+; AVX512F-ONLY-NEXT:    orq %r9, %rax
+; AVX512F-ONLY-NEXT:    shlq $25, %rdi
+; AVX512F-ONLY-NEXT:    orq %rax, %rdi
+; AVX512F-ONLY-NEXT:    movzbl %dl, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %rdx
+; AVX512F-ONLY-NEXT:    shlq $26, %rdx
+; AVX512F-ONLY-NEXT:    orq %rdi, %rdx
+; AVX512F-ONLY-NEXT:    movq %rax, %rdi
+; AVX512F-ONLY-NEXT:    shlq $27, %rdi
+; AVX512F-ONLY-NEXT:    orq %rdx, %rdi
+; AVX512F-ONLY-NEXT:    movq %rax, %rdx
+; AVX512F-ONLY-NEXT:    shlq $28, %rdx
+; AVX512F-ONLY-NEXT:    orq %rdi, %rdx
+; AVX512F-ONLY-NEXT:    movq %rax, %rdi
+; AVX512F-ONLY-NEXT:    shlq $29, %rdi
+; AVX512F-ONLY-NEXT:    orq %rdx, %rdi
+; AVX512F-ONLY-NEXT:    shlq $30, %rax
+; AVX512F-ONLY-NEXT:    orq %rdi, %rax
+; AVX512F-ONLY-NEXT:    movzbl %cl, %ecx
+; AVX512F-ONLY-NEXT:    andl $1, %ecx
+; AVX512F-ONLY-NEXT:    movq %rcx, %rdx
+; AVX512F-ONLY-NEXT:    shlq $31, %rdx
+; AVX512F-ONLY-NEXT:    orq %rax, %rdx
+; AVX512F-ONLY-NEXT:    movq %rcx, %rax
+; AVX512F-ONLY-NEXT:    shlq $32, %rax
+; AVX512F-ONLY-NEXT:    orq %rdx, %rax
+; AVX512F-ONLY-NEXT:    movq %rcx, %rdx
+; AVX512F-ONLY-NEXT:    shlq $33, %rdx
+; AVX512F-ONLY-NEXT:    orq %rax, %rdx
+; AVX512F-ONLY-NEXT:    movq %rcx, %rax
+; AVX512F-ONLY-NEXT:    shlq $34, %rax
+; AVX512F-ONLY-NEXT:    orq %rdx, %rax
+; AVX512F-ONLY-NEXT:    shlq $35, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $36, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %rcx
+; AVX512F-ONLY-NEXT:    shlq $37, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $38, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %rcx
+; AVX512F-ONLY-NEXT:    shlq $39, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    shlq $40, %r13
+; AVX512F-ONLY-NEXT:    orq %rcx, %r13
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $41, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %rcx
+; AVX512F-ONLY-NEXT:    shlq $42, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $43, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %rcx
+; AVX512F-ONLY-NEXT:    shlq $44, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    shlq $45, %r12
+; AVX512F-ONLY-NEXT:    orq %rcx, %r12
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $46, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %rcx
+; AVX512F-ONLY-NEXT:    shlq $47, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $48, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %rcx
+; AVX512F-ONLY-NEXT:    shlq $49, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    shlq $50, %r15
+; AVX512F-ONLY-NEXT:    orq %rcx, %r15
+; AVX512F-ONLY-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %rcx
+; AVX512F-ONLY-NEXT:    shlq $51, %rcx
+; AVX512F-ONLY-NEXT:    orq %r15, %rcx
+; AVX512F-ONLY-NEXT:    movq %rax, %rdx
+; AVX512F-ONLY-NEXT:    shlq $52, %rdx
+; AVX512F-ONLY-NEXT:    orq %rcx, %rdx
+; AVX512F-ONLY-NEXT:    movq %rax, %rcx
+; AVX512F-ONLY-NEXT:    shlq $53, %rcx
+; AVX512F-ONLY-NEXT:    orq %rdx, %rcx
+; AVX512F-ONLY-NEXT:    movq %rax, %rdx
+; AVX512F-ONLY-NEXT:    shlq $54, %rdx
+; AVX512F-ONLY-NEXT:    orq %rcx, %rdx
+; AVX512F-ONLY-NEXT:    shlq $55, %rax
+; AVX512F-ONLY-NEXT:    orq %rdx, %rax
+; AVX512F-ONLY-NEXT:    andl $1, %r8d
+; AVX512F-ONLY-NEXT:    movq %r8, %rcx
+; AVX512F-ONLY-NEXT:    shlq $56, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    movq %r8, %rax
+; AVX512F-ONLY-NEXT:    shlq $57, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    movq %r8, %rcx
+; AVX512F-ONLY-NEXT:    shlq $58, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    movq %r8, %rax
+; AVX512F-ONLY-NEXT:    shlq $59, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    shlq $60, %r8
+; AVX512F-ONLY-NEXT:    orq %rax, %r8
+; AVX512F-ONLY-NEXT:    andl $1, %ebx
+; AVX512F-ONLY-NEXT:    movq %rbx, %rax
+; AVX512F-ONLY-NEXT:    shlq $61, %rax
+; AVX512F-ONLY-NEXT:    orq %r8, %rax
+; AVX512F-ONLY-NEXT:    shlq $62, %rbx
+; AVX512F-ONLY-NEXT:    orq %rax, %rbx
+; AVX512F-ONLY-NEXT:    shlq $63, %r14
+; AVX512F-ONLY-NEXT:    orq %rbx, %r14
+; AVX512F-ONLY-NEXT:    orq %r11, %r14
+; AVX512F-ONLY-NEXT:    movq %r14, 8(%rsi)
+; AVX512F-ONLY-NEXT:    popq %rbx
+; AVX512F-ONLY-NEXT:    popq %r12
+; AVX512F-ONLY-NEXT:    popq %r13
+; AVX512F-ONLY-NEXT:    popq %r14
+; AVX512F-ONLY-NEXT:    popq %r15
+; AVX512F-ONLY-NEXT:    popq %rbp
+; AVX512F-ONLY-NEXT:    retq
+;
+; AVX512DQ-LABEL: mask_replication_factor5_vf64:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    pushq %rbp
+; AVX512DQ-NEXT:    pushq %r15
+; AVX512DQ-NEXT:    pushq %r14
+; AVX512DQ-NEXT:    pushq %r13
+; AVX512DQ-NEXT:    pushq %r12
+; AVX512DQ-NEXT:    pushq %rbx
+; AVX512DQ-NEXT:    kmovw (%rdi), %k1
+; AVX512DQ-NEXT:    kmovw 2(%rdi), %k0
+; AVX512DQ-NEXT:    kmovw 4(%rdi), %k3
+; AVX512DQ-NEXT:    kmovw 6(%rdi), %k2
+; AVX512DQ-NEXT:    kshiftrw $7, %k0, %k4
+; AVX512DQ-NEXT:    kmovw %k4, %eax
+; AVX512DQ-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512DQ-NEXT:    kshiftrw $3, %k0, %k4
+; AVX512DQ-NEXT:    kmovw %k4, %ecx
+; AVX512DQ-NEXT:    kshiftrw $2, %k0, %k4
+; AVX512DQ-NEXT:    kmovw %k4, %edx
+; AVX512DQ-NEXT:    kshiftrw $1, %k0, %k4
+; AVX512DQ-NEXT:    kmovw %k4, %edi
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k4
+; AVX512DQ-NEXT:    kmovw %k4, %r8d
+; AVX512DQ-NEXT:    kshiftrw $14, %k1, %k4
+; AVX512DQ-NEXT:    kmovw %k4, %r9d
+; AVX512DQ-NEXT:    kshiftrw $11, %k0, %k4
+; AVX512DQ-NEXT:    kmovw %k4, %ebp
+; AVX512DQ-NEXT:    kshiftrw $9, %k2, %k4
+; AVX512DQ-NEXT:    kmovw %k4, %ebx
+; AVX512DQ-NEXT:    kshiftrw $4, %k2, %k4
+; AVX512DQ-NEXT:    kmovw %k4, %r14d
+; AVX512DQ-NEXT:    kshiftrw $6, %k1, %k4
+; AVX512DQ-NEXT:    kmovw %k4, %r10d
+; AVX512DQ-NEXT:    kshiftrw $1, %k1, %k4
+; AVX512DQ-NEXT:    kmovw %k4, %r11d
+; AVX512DQ-NEXT:    kshiftrw $2, %k1, %k4
+; AVX512DQ-NEXT:    kmovw %k1, %r15d
+; AVX512DQ-NEXT:    movzbl %r15b, %r15d
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    leaq (%r15,%r15,2), %r12
+; AVX512DQ-NEXT:    leaq (%r12,%r15,4), %r12
+; AVX512DQ-NEXT:    leaq (%r12,%r15,8), %r12
+; AVX512DQ-NEXT:    shlq $4, %r15
+; AVX512DQ-NEXT:    orq %r12, %r15
+; AVX512DQ-NEXT:    movzbl %r11b, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %r11
+; AVX512DQ-NEXT:    shlq $5, %r11
+; AVX512DQ-NEXT:    orq %r15, %r11
+; AVX512DQ-NEXT:    movq %r12, %r15
+; AVX512DQ-NEXT:    shlq $6, %r15
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $7, %r13
+; AVX512DQ-NEXT:    orq %r15, %r13
+; AVX512DQ-NEXT:    movq %r12, %r15
+; AVX512DQ-NEXT:    shlq $8, %r15
+; AVX512DQ-NEXT:    orq %r13, %r15
+; AVX512DQ-NEXT:    kmovw %k4, %r13d
+; AVX512DQ-NEXT:    kshiftrw $3, %k1, %k4
+; AVX512DQ-NEXT:    shlq $9, %r12
+; AVX512DQ-NEXT:    orq %r15, %r12
+; AVX512DQ-NEXT:    movzbl %r13b, %r15d
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    movq %r15, %r13
+; AVX512DQ-NEXT:    shlq $10, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $11, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    movq %r15, %r13
+; AVX512DQ-NEXT:    shlq $12, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $13, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    kmovw %k4, %r13d
+; AVX512DQ-NEXT:    kshiftrw $4, %k1, %k4
+; AVX512DQ-NEXT:    shlq $14, %r15
+; AVX512DQ-NEXT:    orq %r12, %r15
+; AVX512DQ-NEXT:    movzbl %r13b, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $15, %r13
+; AVX512DQ-NEXT:    orq %r15, %r13
+; AVX512DQ-NEXT:    movq %r12, %r15
+; AVX512DQ-NEXT:    shlq $16, %r15
+; AVX512DQ-NEXT:    orq %r13, %r15
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $17, %r13
+; AVX512DQ-NEXT:    orq %r15, %r13
+; AVX512DQ-NEXT:    movq %r12, %r15
+; AVX512DQ-NEXT:    shlq $18, %r15
+; AVX512DQ-NEXT:    orq %r13, %r15
+; AVX512DQ-NEXT:    kmovw %k4, %r13d
+; AVX512DQ-NEXT:    kshiftrw $5, %k1, %k4
+; AVX512DQ-NEXT:    shlq $19, %r12
+; AVX512DQ-NEXT:    orq %r15, %r12
+; AVX512DQ-NEXT:    movzbl %r13b, %r15d
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    movq %r15, %r13
+; AVX512DQ-NEXT:    shlq $20, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $21, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    movq %r15, %r13
+; AVX512DQ-NEXT:    shlq $22, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $23, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    kmovw %k4, %r13d
+; AVX512DQ-NEXT:    kshiftrw $12, %k1, %k4
+; AVX512DQ-NEXT:    shlq $24, %r15
+; AVX512DQ-NEXT:    orq %r12, %r15
+; AVX512DQ-NEXT:    movzbl %r13b, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $25, %r13
+; AVX512DQ-NEXT:    orq %r15, %r13
+; AVX512DQ-NEXT:    movq %r12, %r15
+; AVX512DQ-NEXT:    shlq $26, %r15
+; AVX512DQ-NEXT:    orq %r13, %r15
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $27, %r13
+; AVX512DQ-NEXT:    orq %r15, %r13
+; AVX512DQ-NEXT:    movq %r12, %r15
+; AVX512DQ-NEXT:    shlq $28, %r15
+; AVX512DQ-NEXT:    orq %r13, %r15
+; AVX512DQ-NEXT:    shlq $29, %r12
+; AVX512DQ-NEXT:    orq %r15, %r12
+; AVX512DQ-NEXT:    movzbl %r10b, %r13d
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $30, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    kmovw %k4, %r15d
+; AVX512DQ-NEXT:    kshiftrw $7, %k1, %k4
+; AVX512DQ-NEXT:    shlq $31, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    andl $1, %r10d
+; AVX512DQ-NEXT:    movq %r10, %rax
+; AVX512DQ-NEXT:    shlq $32, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r10, %r13
+; AVX512DQ-NEXT:    shlq $33, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    kmovw %k4, %r12d
+; AVX512DQ-NEXT:    kshiftrw $8, %k1, %k4
+; AVX512DQ-NEXT:    shlq $34, %r10
+; AVX512DQ-NEXT:    orq %r13, %r10
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $35, %rax
+; AVX512DQ-NEXT:    orq %r10, %rax
+; AVX512DQ-NEXT:    movq %r12, %r10
+; AVX512DQ-NEXT:    shlq $36, %r10
+; AVX512DQ-NEXT:    orq %rax, %r10
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $37, %rax
+; AVX512DQ-NEXT:    orq %r10, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $38, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    kmovw %k4, %r10d
+; AVX512DQ-NEXT:    kshiftrw $9, %k1, %k4
+; AVX512DQ-NEXT:    shlq $39, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    andl $1, %r10d
+; AVX512DQ-NEXT:    movq %r10, %rax
+; AVX512DQ-NEXT:    shlq $40, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r10, %r12
+; AVX512DQ-NEXT:    shlq $41, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r10, %rax
+; AVX512DQ-NEXT:    shlq $42, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r10, %r13
+; AVX512DQ-NEXT:    shlq $43, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    kmovw %k4, %r12d
+; AVX512DQ-NEXT:    kshiftrw $10, %k1, %k4
+; AVX512DQ-NEXT:    shlq $44, %r10
+; AVX512DQ-NEXT:    orq %r13, %r10
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $45, %rax
+; AVX512DQ-NEXT:    orq %r10, %rax
+; AVX512DQ-NEXT:    movq %r12, %r10
+; AVX512DQ-NEXT:    shlq $46, %r10
+; AVX512DQ-NEXT:    orq %rax, %r10
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $47, %rax
+; AVX512DQ-NEXT:    orq %r10, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $48, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    kmovw %k4, %r10d
+; AVX512DQ-NEXT:    kshiftrw $11, %k1, %k4
+; AVX512DQ-NEXT:    shlq $49, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    andl $1, %r10d
+; AVX512DQ-NEXT:    movq %r10, %rax
+; AVX512DQ-NEXT:    shlq $50, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r10, %r12
+; AVX512DQ-NEXT:    shlq $51, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r10, %rax
+; AVX512DQ-NEXT:    shlq $52, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r10, %r13
+; AVX512DQ-NEXT:    shlq $53, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    kmovw %k4, %r12d
+; AVX512DQ-NEXT:    kshiftrw $3, %k2, %k4
+; AVX512DQ-NEXT:    shlq $54, %r10
+; AVX512DQ-NEXT:    orq %r13, %r10
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $55, %rax
+; AVX512DQ-NEXT:    orq %r10, %rax
+; AVX512DQ-NEXT:    movq %r12, %r10
+; AVX512DQ-NEXT:    shlq $56, %r10
+; AVX512DQ-NEXT:    orq %rax, %r10
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $57, %rax
+; AVX512DQ-NEXT:    orq %r10, %rax
+; AVX512DQ-NEXT:    movq %r12, %r10
+; AVX512DQ-NEXT:    shlq $58, %r10
+; AVX512DQ-NEXT:    orq %rax, %r10
+; AVX512DQ-NEXT:    shlq $59, %r12
+; AVX512DQ-NEXT:    orq %r10, %r12
+; AVX512DQ-NEXT:    movzbl %r15b, %r10d
+; AVX512DQ-NEXT:    movl %r15d, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %r15
+; AVX512DQ-NEXT:    shlq $60, %r15
+; AVX512DQ-NEXT:    orq %r12, %r15
+; AVX512DQ-NEXT:    movq %rax, %r12
+; AVX512DQ-NEXT:    shlq $61, %r12
+; AVX512DQ-NEXT:    orq %r15, %r12
+; AVX512DQ-NEXT:    kmovw %k4, %r15d
+; AVX512DQ-NEXT:    kshiftrw $5, %k2, %k4
+; AVX512DQ-NEXT:    shlq $62, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r10, %r12
+; AVX512DQ-NEXT:    shlq $63, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    orq %r11, %r12
+; AVX512DQ-NEXT:    movq %r12, (%rsi)
+; AVX512DQ-NEXT:    movzbl %r15b, %r11d
+; AVX512DQ-NEXT:    movl %r11d, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    leaq (%rax,%rax,2), %r15
+; AVX512DQ-NEXT:    leaq (%r15,%rax,4), %r15
+; AVX512DQ-NEXT:    leaq (%r15,%rax,8), %rax
+; AVX512DQ-NEXT:    movzbl %r14b, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %r15
+; AVX512DQ-NEXT:    shlq $4, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    movq %r12, %r14
+; AVX512DQ-NEXT:    shlq $5, %r14
+; AVX512DQ-NEXT:    orq %r15, %r14
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $6, %rax
+; AVX512DQ-NEXT:    movq %r12, %r15
+; AVX512DQ-NEXT:    shlq $7, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    kmovw %k4, %eax
+; AVX512DQ-NEXT:    kshiftrw $6, %k2, %k4
+; AVX512DQ-NEXT:    shlq $8, %r12
+; AVX512DQ-NEXT:    orq %r15, %r12
+; AVX512DQ-NEXT:    movzbl %al, %r15d
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $9, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $10, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $11, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $12, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    kmovw %k4, %eax
+; AVX512DQ-NEXT:    kshiftrw $7, %k2, %k4
+; AVX512DQ-NEXT:    shlq $13, %r15
+; AVX512DQ-NEXT:    orq %r12, %r15
+; AVX512DQ-NEXT:    movzbl %al, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $14, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movq %r12, %r15
+; AVX512DQ-NEXT:    shlq $15, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $16, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movq %r12, %r15
+; AVX512DQ-NEXT:    shlq $17, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    kmovw %k4, %eax
+; AVX512DQ-NEXT:    kshiftrw $8, %k2, %k4
+; AVX512DQ-NEXT:    shlq $18, %r12
+; AVX512DQ-NEXT:    orq %r15, %r12
+; AVX512DQ-NEXT:    movzbl %al, %r15d
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $19, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $20, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $21, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $22, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    kmovw %k4, %eax
+; AVX512DQ-NEXT:    kshiftrw $15, %k2, %k4
+; AVX512DQ-NEXT:    shlq $23, %r15
+; AVX512DQ-NEXT:    orq %r12, %r15
+; AVX512DQ-NEXT:    movzbl %al, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $24, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movq %r12, %r15
+; AVX512DQ-NEXT:    shlq $25, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $26, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movq %r12, %r15
+; AVX512DQ-NEXT:    shlq $27, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    shlq $28, %r12
+; AVX512DQ-NEXT:    orq %r15, %r12
+; AVX512DQ-NEXT:    movzbl %bl, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %r15
+; AVX512DQ-NEXT:    shlq $29, %r15
+; AVX512DQ-NEXT:    orq %r12, %r15
+; AVX512DQ-NEXT:    movq %rax, %r12
+; AVX512DQ-NEXT:    shlq $30, %r12
+; AVX512DQ-NEXT:    orq %r15, %r12
+; AVX512DQ-NEXT:    kmovw %k4, %r15d
+; AVX512DQ-NEXT:    kshiftrw $10, %k2, %k4
+; AVX512DQ-NEXT:    shlq $31, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    andl $1, %ebx
+; AVX512DQ-NEXT:    movq %rbx, %r13
+; AVX512DQ-NEXT:    shlq $32, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    kmovw %k4, %r12d
+; AVX512DQ-NEXT:    kshiftrw $11, %k2, %k4
+; AVX512DQ-NEXT:    shlq $33, %rbx
+; AVX512DQ-NEXT:    orq %r13, %rbx
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $34, %rax
+; AVX512DQ-NEXT:    orq %rbx, %rax
+; AVX512DQ-NEXT:    movq %r12, %rbx
+; AVX512DQ-NEXT:    shlq $35, %rbx
+; AVX512DQ-NEXT:    orq %rax, %rbx
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $36, %rax
+; AVX512DQ-NEXT:    orq %rbx, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $37, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    kmovw %k4, %ebx
+; AVX512DQ-NEXT:    kshiftrw $12, %k2, %k4
+; AVX512DQ-NEXT:    shlq $38, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    andl $1, %ebx
+; AVX512DQ-NEXT:    movq %rbx, %rax
+; AVX512DQ-NEXT:    shlq $39, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %rbx, %r12
+; AVX512DQ-NEXT:    shlq $40, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %rbx, %rax
+; AVX512DQ-NEXT:    shlq $41, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %rbx, %r13
+; AVX512DQ-NEXT:    shlq $42, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    kmovw %k4, %r12d
+; AVX512DQ-NEXT:    kshiftrw $13, %k2, %k4
+; AVX512DQ-NEXT:    shlq $43, %rbx
+; AVX512DQ-NEXT:    orq %r13, %rbx
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $44, %rax
+; AVX512DQ-NEXT:    orq %rbx, %rax
+; AVX512DQ-NEXT:    movq %r12, %rbx
+; AVX512DQ-NEXT:    shlq $45, %rbx
+; AVX512DQ-NEXT:    orq %rax, %rbx
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $46, %rax
+; AVX512DQ-NEXT:    orq %rbx, %rax
+; AVX512DQ-NEXT:    movq %r12, %rbx
+; AVX512DQ-NEXT:    shlq $47, %rbx
+; AVX512DQ-NEXT:    orq %rax, %rbx
+; AVX512DQ-NEXT:    kmovw %k4, %r13d
+; AVX512DQ-NEXT:    kshiftrw $14, %k2, %k4
+; AVX512DQ-NEXT:    shlq $48, %r12
+; AVX512DQ-NEXT:    orq %rbx, %r12
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $49, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %rbx
+; AVX512DQ-NEXT:    shlq $50, %rbx
+; AVX512DQ-NEXT:    orq %rax, %rbx
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $51, %rax
+; AVX512DQ-NEXT:    orq %rbx, %rax
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $52, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    kmovw %k4, %ebx
+; AVX512DQ-NEXT:    kshiftrw $9, %k0, %k4
+; AVX512DQ-NEXT:    shlq $53, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    andl $1, %ebx
+; AVX512DQ-NEXT:    movq %rbx, %rax
+; AVX512DQ-NEXT:    shlq $54, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %rbx, %r12
+; AVX512DQ-NEXT:    shlq $55, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %rbx, %rax
+; AVX512DQ-NEXT:    shlq $56, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %rbx, %r12
+; AVX512DQ-NEXT:    shlq $57, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    shlq $58, %rbx
+; AVX512DQ-NEXT:    orq %r12, %rbx
+; AVX512DQ-NEXT:    movzbl %r15b, %eax
+; AVX512DQ-NEXT:    # kill: def $r15d killed $r15d def $r15
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $59, %r12
+; AVX512DQ-NEXT:    orq %rbx, %r12
+; AVX512DQ-NEXT:    movq %r15, %rbx
+; AVX512DQ-NEXT:    shlq $60, %rbx
+; AVX512DQ-NEXT:    orq %r12, %rbx
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $61, %r12
+; AVX512DQ-NEXT:    orq %rbx, %r12
+; AVX512DQ-NEXT:    kmovw %k4, %ebx
+; AVX512DQ-NEXT:    kshiftrw $10, %k0, %k4
+; AVX512DQ-NEXT:    shlq $62, %r15
+; AVX512DQ-NEXT:    orq %r12, %r15
+; AVX512DQ-NEXT:    kmovw %k4, %r12d
+; AVX512DQ-NEXT:    kshiftrw $12, %k0, %k4
+; AVX512DQ-NEXT:    shlq $63, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    orq %r14, %rax
+; AVX512DQ-NEXT:    movq %rax, 32(%rsi)
+; AVX512DQ-NEXT:    movzbl %r12b, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movzbl %bl, %r14d
+; AVX512DQ-NEXT:    movl %r14d, %r15d
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    leaq (%r15,%r15,2), %r15
+; AVX512DQ-NEXT:    leaq (%r15,%rax,4), %r15
+; AVX512DQ-NEXT:    leaq (%r15,%rax,8), %r15
+; AVX512DQ-NEXT:    movq %rax, %r12
+; AVX512DQ-NEXT:    shlq $4, %r12
+; AVX512DQ-NEXT:    orq %r15, %r12
+; AVX512DQ-NEXT:    movq %rax, %r15
+; AVX512DQ-NEXT:    shlq $5, %r15
+; AVX512DQ-NEXT:    orq %r12, %r15
+; AVX512DQ-NEXT:    shlq $6, %rax
+; AVX512DQ-NEXT:    movzbl %bpl, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $7, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $8, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $9, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $10, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    kmovw %k4, %ebp
+; AVX512DQ-NEXT:    kshiftrw $13, %k0, %k4
+; AVX512DQ-NEXT:    shlq $11, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movzbl %bpl, %r13d
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $12, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $13, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $14, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $15, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    kmovw %k4, %eax
+; AVX512DQ-NEXT:    kshiftrw $14, %k0, %k4
+; AVX512DQ-NEXT:    shlq $16, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    movzbl %al, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $17, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $18, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $19, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $20, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    kmovw %k4, %eax
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k4
+; AVX512DQ-NEXT:    shlq $21, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    movzbl %al, %r13d
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $22, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $23, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $24, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $25, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    kmovw %k4, %eax
+; AVX512DQ-NEXT:    kshiftrw $1, %k3, %k4
+; AVX512DQ-NEXT:    shlq $26, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    movzbl %al, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $27, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $28, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $29, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $30, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    shlq $31, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    kmovw %k3, %r13d
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $32, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $33, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $34, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %rbp
+; AVX512DQ-NEXT:    shlq $35, %rbp
+; AVX512DQ-NEXT:    orq %rax, %rbp
+; AVX512DQ-NEXT:    kmovw %k4, %r12d
+; AVX512DQ-NEXT:    kshiftrw $2, %k3, %k4
+; AVX512DQ-NEXT:    shlq $36, %r13
+; AVX512DQ-NEXT:    orq %rbp, %r13
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $37, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $38, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $39, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %rbp
+; AVX512DQ-NEXT:    shlq $40, %rbp
+; AVX512DQ-NEXT:    orq %rax, %rbp
+; AVX512DQ-NEXT:    kmovw %k4, %r13d
+; AVX512DQ-NEXT:    kshiftrw $3, %k3, %k4
+; AVX512DQ-NEXT:    shlq $41, %r12
+; AVX512DQ-NEXT:    orq %rbp, %r12
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $42, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $43, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $44, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %rbp
+; AVX512DQ-NEXT:    shlq $45, %rbp
+; AVX512DQ-NEXT:    orq %rax, %rbp
+; AVX512DQ-NEXT:    kmovw %k4, %r12d
+; AVX512DQ-NEXT:    kshiftrw $4, %k3, %k4
+; AVX512DQ-NEXT:    shlq $46, %r13
+; AVX512DQ-NEXT:    orq %rbp, %r13
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $47, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $48, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $49, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %rbp
+; AVX512DQ-NEXT:    shlq $50, %rbp
+; AVX512DQ-NEXT:    orq %rax, %rbp
+; AVX512DQ-NEXT:    kmovw %k4, %r13d
+; AVX512DQ-NEXT:    kshiftrw $5, %k3, %k4
+; AVX512DQ-NEXT:    shlq $51, %r12
+; AVX512DQ-NEXT:    orq %rbp, %r12
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $52, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $53, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $54, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %rbp
+; AVX512DQ-NEXT:    shlq $55, %rbp
+; AVX512DQ-NEXT:    orq %rax, %rbp
+; AVX512DQ-NEXT:    kmovw %k4, %r12d
+; AVX512DQ-NEXT:    kshiftrw $6, %k3, %k4
+; AVX512DQ-NEXT:    shlq $56, %r13
+; AVX512DQ-NEXT:    orq %rbp, %r13
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $57, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $58, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $59, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $60, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    kmovw %k4, %eax
+; AVX512DQ-NEXT:    kshiftrw $8, %k3, %k4
+; AVX512DQ-NEXT:    shlq $61, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    movzbl %al, %r13d
+; AVX512DQ-NEXT:    # kill: def $eax killed $eax def $rax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    shlq $62, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $63, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    kmovw %k4, %eax
+; AVX512DQ-NEXT:    kshiftrw $7, %k3, %k4
+; AVX512DQ-NEXT:    orq %r15, %r12
+; AVX512DQ-NEXT:    kmovw %k4, %ebp
+; AVX512DQ-NEXT:    kshiftrw $9, %k3, %k4
+; AVX512DQ-NEXT:    movq %r12, 16(%rsi)
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    leaq (%r13,%r13,2), %r15
+; AVX512DQ-NEXT:    leaq (%r15,%r13,4), %r15
+; AVX512DQ-NEXT:    movzbl %bpl, %r13d
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    leaq (%r15,%r13,8), %r15
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $4, %r12
+; AVX512DQ-NEXT:    orq %r15, %r12
+; AVX512DQ-NEXT:    movq %r13, %r15
+; AVX512DQ-NEXT:    shlq $5, %r15
+; AVX512DQ-NEXT:    orq %r12, %r15
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $6, %r12
+; AVX512DQ-NEXT:    shlq $7, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    movzbl %al, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $8, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $9, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $10, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $11, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    kmovw %k4, %eax
+; AVX512DQ-NEXT:    kshiftrw $10, %k3, %k4
+; AVX512DQ-NEXT:    shlq $12, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    movzbl %al, %r13d
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $13, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $14, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $15, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $16, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    kmovw %k4, %eax
+; AVX512DQ-NEXT:    kshiftrw $11, %k3, %k4
+; AVX512DQ-NEXT:    shlq $17, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    movzbl %al, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $18, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $19, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $20, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $21, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    kmovw %k4, %eax
+; AVX512DQ-NEXT:    kshiftrw $12, %k3, %k4
+; AVX512DQ-NEXT:    shlq $22, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    movzbl %al, %r13d
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $23, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $24, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $25, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $26, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    kmovw %k4, %eax
+; AVX512DQ-NEXT:    kshiftrw $13, %k3, %k4
+; AVX512DQ-NEXT:    shlq $27, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    movzbl %al, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $28, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $29, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $30, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %rbp
+; AVX512DQ-NEXT:    shlq $31, %rbp
+; AVX512DQ-NEXT:    orq %rax, %rbp
+; AVX512DQ-NEXT:    kmovw %k4, %r13d
+; AVX512DQ-NEXT:    kshiftrw $14, %k3, %k4
+; AVX512DQ-NEXT:    shlq $32, %r12
+; AVX512DQ-NEXT:    orq %rbp, %r12
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $33, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $34, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $35, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %rbp
+; AVX512DQ-NEXT:    shlq $36, %rbp
+; AVX512DQ-NEXT:    orq %rax, %rbp
+; AVX512DQ-NEXT:    kmovw %k4, %r12d
+; AVX512DQ-NEXT:    kshiftrw $15, %k3, %k3
+; AVX512DQ-NEXT:    shlq $37, %r13
+; AVX512DQ-NEXT:    orq %rbp, %r13
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $38, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $39, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $40, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $41, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    kmovw %k3, %eax
+; AVX512DQ-NEXT:    kshiftrw $1, %k2, %k3
+; AVX512DQ-NEXT:    shlq $42, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    movzbl %al, %r13d
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $43, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $44, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $45, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $46, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    shlq $47, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    kmovw %k2, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $48, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $49, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $50, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %rbp
+; AVX512DQ-NEXT:    shlq $51, %rbp
+; AVX512DQ-NEXT:    orq %rax, %rbp
+; AVX512DQ-NEXT:    kmovw %k3, %r13d
+; AVX512DQ-NEXT:    kshiftrw $2, %k2, %k2
+; AVX512DQ-NEXT:    shlq $52, %r12
+; AVX512DQ-NEXT:    orq %rbp, %r12
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $53, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $54, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $55, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %rbp
+; AVX512DQ-NEXT:    shlq $56, %rbp
+; AVX512DQ-NEXT:    orq %rax, %rbp
+; AVX512DQ-NEXT:    kmovw %k2, %r12d
+; AVX512DQ-NEXT:    kshiftrw $13, %k1, %k1
+; AVX512DQ-NEXT:    shlq $57, %r13
+; AVX512DQ-NEXT:    orq %rbp, %r13
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $58, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $59, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $60, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $61, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    kmovw %k1, %ebp
+; AVX512DQ-NEXT:    kshiftrw $4, %k0, %k1
+; AVX512DQ-NEXT:    shlq $62, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    kmovw %k1, %r13d
+; AVX512DQ-NEXT:    kshiftrw $5, %k0, %k1
+; AVX512DQ-NEXT:    shlq $63, %r11
+; AVX512DQ-NEXT:    orq %r12, %r11
+; AVX512DQ-NEXT:    kmovw %k1, %r12d
+; AVX512DQ-NEXT:    kshiftrw $6, %k0, %k1
+; AVX512DQ-NEXT:    orq %r15, %r11
+; AVX512DQ-NEXT:    kmovw %k1, %r15d
+; AVX512DQ-NEXT:    movq %r11, 24(%rsi)
+; AVX512DQ-NEXT:    movzbl %bpl, %r11d
+; AVX512DQ-NEXT:    andl $1, %r11d
+; AVX512DQ-NEXT:    andl $1, %r10d
+; AVX512DQ-NEXT:    leaq (%r10,%r11,2), %rax
+; AVX512DQ-NEXT:    leaq (%rax,%r11,4), %rax
+; AVX512DQ-NEXT:    leaq (%rax,%r11,8), %rax
+; AVX512DQ-NEXT:    movq %r11, %r10
+; AVX512DQ-NEXT:    shlq $4, %r10
+; AVX512DQ-NEXT:    orq %rax, %r10
+; AVX512DQ-NEXT:    shlq $5, %r11
+; AVX512DQ-NEXT:    orq %r10, %r11
+; AVX512DQ-NEXT:    movzbl %r9b, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %r9
+; AVX512DQ-NEXT:    shlq $6, %r9
+; AVX512DQ-NEXT:    movq %rax, %r10
+; AVX512DQ-NEXT:    shlq $7, %r10
+; AVX512DQ-NEXT:    orq %r9, %r10
+; AVX512DQ-NEXT:    movq %rax, %r9
+; AVX512DQ-NEXT:    shlq $8, %r9
+; AVX512DQ-NEXT:    orq %r10, %r9
+; AVX512DQ-NEXT:    movq %rax, %rbp
+; AVX512DQ-NEXT:    shlq $9, %rbp
+; AVX512DQ-NEXT:    orq %r9, %rbp
+; AVX512DQ-NEXT:    kmovw %k0, %r10d
+; AVX512DQ-NEXT:    kshiftrw $8, %k0, %k0
+; AVX512DQ-NEXT:    shlq $10, %rax
+; AVX512DQ-NEXT:    orq %rbp, %rax
+; AVX512DQ-NEXT:    movzbl %r8b, %r9d
+; AVX512DQ-NEXT:    andl $1, %r9d
+; AVX512DQ-NEXT:    movq %r9, %r8
+; AVX512DQ-NEXT:    shlq $11, %r8
+; AVX512DQ-NEXT:    orq %rax, %r8
+; AVX512DQ-NEXT:    movq %r9, %rax
+; AVX512DQ-NEXT:    shlq $12, %rax
+; AVX512DQ-NEXT:    orq %r8, %rax
+; AVX512DQ-NEXT:    movq %r9, %r8
+; AVX512DQ-NEXT:    shlq $13, %r8
+; AVX512DQ-NEXT:    orq %rax, %r8
+; AVX512DQ-NEXT:    movq %r9, %rax
+; AVX512DQ-NEXT:    shlq $14, %rax
+; AVX512DQ-NEXT:    orq %r8, %rax
+; AVX512DQ-NEXT:    kmovw %k0, %r8d
+; AVX512DQ-NEXT:    shlq $15, %r9
+; AVX512DQ-NEXT:    orq %rax, %r9
+; AVX512DQ-NEXT:    movzbl %r10b, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %r10
+; AVX512DQ-NEXT:    shlq $16, %r10
+; AVX512DQ-NEXT:    orq %r9, %r10
+; AVX512DQ-NEXT:    movq %rax, %r9
+; AVX512DQ-NEXT:    shlq $17, %r9
+; AVX512DQ-NEXT:    orq %r10, %r9
+; AVX512DQ-NEXT:    movq %rax, %r10
+; AVX512DQ-NEXT:    shlq $18, %r10
+; AVX512DQ-NEXT:    orq %r9, %r10
+; AVX512DQ-NEXT:    movq %rax, %r9
+; AVX512DQ-NEXT:    shlq $19, %r9
+; AVX512DQ-NEXT:    orq %r10, %r9
+; AVX512DQ-NEXT:    shlq $20, %rax
+; AVX512DQ-NEXT:    orq %r9, %rax
+; AVX512DQ-NEXT:    movzbl %dil, %edi
+; AVX512DQ-NEXT:    andl $1, %edi
+; AVX512DQ-NEXT:    movq %rdi, %r9
+; AVX512DQ-NEXT:    shlq $21, %r9
+; AVX512DQ-NEXT:    orq %rax, %r9
+; AVX512DQ-NEXT:    movq %rdi, %rax
+; AVX512DQ-NEXT:    shlq $22, %rax
+; AVX512DQ-NEXT:    orq %r9, %rax
+; AVX512DQ-NEXT:    movq %rdi, %r9
+; AVX512DQ-NEXT:    shlq $23, %r9
+; AVX512DQ-NEXT:    orq %rax, %r9
+; AVX512DQ-NEXT:    movq %rdi, %rax
+; AVX512DQ-NEXT:    shlq $24, %rax
+; AVX512DQ-NEXT:    orq %r9, %rax
+; AVX512DQ-NEXT:    shlq $25, %rdi
+; AVX512DQ-NEXT:    orq %rax, %rdi
+; AVX512DQ-NEXT:    movzbl %dl, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %rdx
+; AVX512DQ-NEXT:    shlq $26, %rdx
+; AVX512DQ-NEXT:    orq %rdi, %rdx
+; AVX512DQ-NEXT:    movq %rax, %rdi
+; AVX512DQ-NEXT:    shlq $27, %rdi
+; AVX512DQ-NEXT:    orq %rdx, %rdi
+; AVX512DQ-NEXT:    movq %rax, %rdx
+; AVX512DQ-NEXT:    shlq $28, %rdx
+; AVX512DQ-NEXT:    orq %rdi, %rdx
+; AVX512DQ-NEXT:    movq %rax, %rdi
+; AVX512DQ-NEXT:    shlq $29, %rdi
+; AVX512DQ-NEXT:    orq %rdx, %rdi
+; AVX512DQ-NEXT:    shlq $30, %rax
+; AVX512DQ-NEXT:    orq %rdi, %rax
+; AVX512DQ-NEXT:    movzbl %cl, %ecx
+; AVX512DQ-NEXT:    andl $1, %ecx
+; AVX512DQ-NEXT:    movq %rcx, %rdx
+; AVX512DQ-NEXT:    shlq $31, %rdx
+; AVX512DQ-NEXT:    orq %rax, %rdx
+; AVX512DQ-NEXT:    movq %rcx, %rax
+; AVX512DQ-NEXT:    shlq $32, %rax
+; AVX512DQ-NEXT:    orq %rdx, %rax
+; AVX512DQ-NEXT:    movq %rcx, %rdx
+; AVX512DQ-NEXT:    shlq $33, %rdx
+; AVX512DQ-NEXT:    orq %rax, %rdx
+; AVX512DQ-NEXT:    movq %rcx, %rax
+; AVX512DQ-NEXT:    shlq $34, %rax
+; AVX512DQ-NEXT:    orq %rdx, %rax
+; AVX512DQ-NEXT:    shlq $35, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $36, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    movq %r13, %rcx
+; AVX512DQ-NEXT:    shlq $37, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $38, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    movq %r13, %rcx
+; AVX512DQ-NEXT:    shlq $39, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    shlq $40, %r13
+; AVX512DQ-NEXT:    orq %rcx, %r13
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $41, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %rcx
+; AVX512DQ-NEXT:    shlq $42, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $43, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    movq %r12, %rcx
+; AVX512DQ-NEXT:    shlq $44, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    shlq $45, %r12
+; AVX512DQ-NEXT:    orq %rcx, %r12
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $46, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r15, %rcx
+; AVX512DQ-NEXT:    shlq $47, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $48, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    movq %r15, %rcx
+; AVX512DQ-NEXT:    shlq $49, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    shlq $50, %r15
+; AVX512DQ-NEXT:    orq %rcx, %r15
+; AVX512DQ-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %rcx
+; AVX512DQ-NEXT:    shlq $51, %rcx
+; AVX512DQ-NEXT:    orq %r15, %rcx
+; AVX512DQ-NEXT:    movq %rax, %rdx
+; AVX512DQ-NEXT:    shlq $52, %rdx
+; AVX512DQ-NEXT:    orq %rcx, %rdx
+; AVX512DQ-NEXT:    movq %rax, %rcx
+; AVX512DQ-NEXT:    shlq $53, %rcx
+; AVX512DQ-NEXT:    orq %rdx, %rcx
+; AVX512DQ-NEXT:    movq %rax, %rdx
+; AVX512DQ-NEXT:    shlq $54, %rdx
+; AVX512DQ-NEXT:    orq %rcx, %rdx
+; AVX512DQ-NEXT:    shlq $55, %rax
+; AVX512DQ-NEXT:    orq %rdx, %rax
+; AVX512DQ-NEXT:    andl $1, %r8d
+; AVX512DQ-NEXT:    movq %r8, %rcx
+; AVX512DQ-NEXT:    shlq $56, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    movq %r8, %rax
+; AVX512DQ-NEXT:    shlq $57, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    movq %r8, %rcx
+; AVX512DQ-NEXT:    shlq $58, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    movq %r8, %rax
+; AVX512DQ-NEXT:    shlq $59, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    shlq $60, %r8
+; AVX512DQ-NEXT:    orq %rax, %r8
+; AVX512DQ-NEXT:    andl $1, %ebx
+; AVX512DQ-NEXT:    movq %rbx, %rax
+; AVX512DQ-NEXT:    shlq $61, %rax
+; AVX512DQ-NEXT:    orq %r8, %rax
+; AVX512DQ-NEXT:    shlq $62, %rbx
+; AVX512DQ-NEXT:    orq %rax, %rbx
+; AVX512DQ-NEXT:    shlq $63, %r14
+; AVX512DQ-NEXT:    orq %rbx, %r14
+; AVX512DQ-NEXT:    orq %r11, %r14
+; AVX512DQ-NEXT:    movq %r14, 8(%rsi)
+; AVX512DQ-NEXT:    popq %rbx
+; AVX512DQ-NEXT:    popq %r12
+; AVX512DQ-NEXT:    popq %r13
+; AVX512DQ-NEXT:    popq %r14
+; AVX512DQ-NEXT:    popq %r15
+; AVX512DQ-NEXT:    popq %rbp
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: mask_replication_factor5_vf64:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    pushq %rbp
+; AVX512BW-NEXT:    pushq %r15
+; AVX512BW-NEXT:    pushq %r14
+; AVX512BW-NEXT:    pushq %r13
+; AVX512BW-NEXT:    pushq %r12
+; AVX512BW-NEXT:    pushq %rbx
+; AVX512BW-NEXT:    kmovq (%rdi), %k0
+; AVX512BW-NEXT:    kshiftrq $19, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512BW-NEXT:    kshiftrq $18, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %ecx
+; AVX512BW-NEXT:    kshiftrq $17, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %edx
+; AVX512BW-NEXT:    kshiftrq $16, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %edi
+; AVX512BW-NEXT:    kshiftrq $15, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r8d
+; AVX512BW-NEXT:    kshiftrq $14, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r9d
+; AVX512BW-NEXT:    kshiftrq $27, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %ebp
+; AVX512BW-NEXT:    kshiftrq $57, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %ebx
+; AVX512BW-NEXT:    kshiftrq $52, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r14d
+; AVX512BW-NEXT:    kshiftrq $6, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r10d
+; AVX512BW-NEXT:    kshiftrq $1, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r11d
+; AVX512BW-NEXT:    kshiftrq $2, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k0, %r15d
+; AVX512BW-NEXT:    movzbl %r15b, %r15d
+; AVX512BW-NEXT:    andl $1, %r15d
+; AVX512BW-NEXT:    leaq (%r15,%r15,2), %r12
+; AVX512BW-NEXT:    leaq (%r12,%r15,4), %r12
+; AVX512BW-NEXT:    leaq (%r12,%r15,8), %r12
+; AVX512BW-NEXT:    shlq $4, %r15
+; AVX512BW-NEXT:    orq %r12, %r15
+; AVX512BW-NEXT:    movzbl %r11b, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %r11
+; AVX512BW-NEXT:    shlq $5, %r11
+; AVX512BW-NEXT:    orq %r15, %r11
+; AVX512BW-NEXT:    movq %r12, %r15
+; AVX512BW-NEXT:    shlq $6, %r15
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $7, %r13
+; AVX512BW-NEXT:    orq %r15, %r13
+; AVX512BW-NEXT:    movq %r12, %r15
+; AVX512BW-NEXT:    shlq $8, %r15
+; AVX512BW-NEXT:    orq %r13, %r15
+; AVX512BW-NEXT:    kmovd %k1, %r13d
+; AVX512BW-NEXT:    kshiftrq $3, %k0, %k1
+; AVX512BW-NEXT:    shlq $9, %r12
+; AVX512BW-NEXT:    orq %r15, %r12
+; AVX512BW-NEXT:    movzbl %r13b, %r15d
+; AVX512BW-NEXT:    andl $1, %r15d
+; AVX512BW-NEXT:    movq %r15, %r13
+; AVX512BW-NEXT:    shlq $10, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    movq %r15, %r12
+; AVX512BW-NEXT:    shlq $11, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    movq %r15, %r13
+; AVX512BW-NEXT:    shlq $12, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    movq %r15, %r12
+; AVX512BW-NEXT:    shlq $13, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    kmovd %k1, %r13d
+; AVX512BW-NEXT:    kshiftrq $4, %k0, %k1
+; AVX512BW-NEXT:    shlq $14, %r15
+; AVX512BW-NEXT:    orq %r12, %r15
+; AVX512BW-NEXT:    movzbl %r13b, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $15, %r13
+; AVX512BW-NEXT:    orq %r15, %r13
+; AVX512BW-NEXT:    movq %r12, %r15
+; AVX512BW-NEXT:    shlq $16, %r15
+; AVX512BW-NEXT:    orq %r13, %r15
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $17, %r13
+; AVX512BW-NEXT:    orq %r15, %r13
+; AVX512BW-NEXT:    movq %r12, %r15
+; AVX512BW-NEXT:    shlq $18, %r15
+; AVX512BW-NEXT:    orq %r13, %r15
+; AVX512BW-NEXT:    kmovd %k1, %r13d
+; AVX512BW-NEXT:    kshiftrq $5, %k0, %k1
+; AVX512BW-NEXT:    shlq $19, %r12
+; AVX512BW-NEXT:    orq %r15, %r12
+; AVX512BW-NEXT:    movzbl %r13b, %r15d
+; AVX512BW-NEXT:    andl $1, %r15d
+; AVX512BW-NEXT:    movq %r15, %r13
+; AVX512BW-NEXT:    shlq $20, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    movq %r15, %r12
+; AVX512BW-NEXT:    shlq $21, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    movq %r15, %r13
+; AVX512BW-NEXT:    shlq $22, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    movq %r15, %r12
+; AVX512BW-NEXT:    shlq $23, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    kmovd %k1, %r13d
+; AVX512BW-NEXT:    kshiftrq $12, %k0, %k1
+; AVX512BW-NEXT:    shlq $24, %r15
+; AVX512BW-NEXT:    orq %r12, %r15
+; AVX512BW-NEXT:    movzbl %r13b, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $25, %r13
+; AVX512BW-NEXT:    orq %r15, %r13
+; AVX512BW-NEXT:    movq %r12, %r15
+; AVX512BW-NEXT:    shlq $26, %r15
+; AVX512BW-NEXT:    orq %r13, %r15
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $27, %r13
+; AVX512BW-NEXT:    orq %r15, %r13
+; AVX512BW-NEXT:    movq %r12, %r15
+; AVX512BW-NEXT:    shlq $28, %r15
+; AVX512BW-NEXT:    orq %r13, %r15
+; AVX512BW-NEXT:    shlq $29, %r12
+; AVX512BW-NEXT:    orq %r15, %r12
+; AVX512BW-NEXT:    movzbl %r10b, %r13d
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $30, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    kmovd %k1, %r15d
+; AVX512BW-NEXT:    kshiftrq $7, %k0, %k1
+; AVX512BW-NEXT:    shlq $31, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    andl $1, %r10d
+; AVX512BW-NEXT:    movq %r10, %rax
+; AVX512BW-NEXT:    shlq $32, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r10, %r13
+; AVX512BW-NEXT:    shlq $33, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrq $8, %k0, %k1
+; AVX512BW-NEXT:    shlq $34, %r10
+; AVX512BW-NEXT:    orq %r13, %r10
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $35, %rax
+; AVX512BW-NEXT:    orq %r10, %rax
+; AVX512BW-NEXT:    movq %r12, %r10
+; AVX512BW-NEXT:    shlq $36, %r10
+; AVX512BW-NEXT:    orq %rax, %r10
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $37, %rax
+; AVX512BW-NEXT:    orq %r10, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $38, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    kmovd %k1, %r10d
+; AVX512BW-NEXT:    kshiftrq $9, %k0, %k1
+; AVX512BW-NEXT:    shlq $39, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    andl $1, %r10d
+; AVX512BW-NEXT:    movq %r10, %rax
+; AVX512BW-NEXT:    shlq $40, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r10, %r12
+; AVX512BW-NEXT:    shlq $41, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r10, %rax
+; AVX512BW-NEXT:    shlq $42, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r10, %r13
+; AVX512BW-NEXT:    shlq $43, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrq $10, %k0, %k1
+; AVX512BW-NEXT:    shlq $44, %r10
+; AVX512BW-NEXT:    orq %r13, %r10
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $45, %rax
+; AVX512BW-NEXT:    orq %r10, %rax
+; AVX512BW-NEXT:    movq %r12, %r10
+; AVX512BW-NEXT:    shlq $46, %r10
+; AVX512BW-NEXT:    orq %rax, %r10
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $47, %rax
+; AVX512BW-NEXT:    orq %r10, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $48, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    kmovd %k1, %r10d
+; AVX512BW-NEXT:    kshiftrq $11, %k0, %k1
+; AVX512BW-NEXT:    shlq $49, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    andl $1, %r10d
+; AVX512BW-NEXT:    movq %r10, %rax
+; AVX512BW-NEXT:    shlq $50, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r10, %r12
+; AVX512BW-NEXT:    shlq $51, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r10, %rax
+; AVX512BW-NEXT:    shlq $52, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r10, %r12
+; AVX512BW-NEXT:    shlq $53, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    kshiftrq $51, %k0, %k1
+; AVX512BW-NEXT:    shlq $54, %r10
+; AVX512BW-NEXT:    orq %r12, %r10
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %r12
+; AVX512BW-NEXT:    shlq $55, %r12
+; AVX512BW-NEXT:    orq %r10, %r12
+; AVX512BW-NEXT:    movq %rax, %r10
+; AVX512BW-NEXT:    shlq $56, %r10
+; AVX512BW-NEXT:    orq %r12, %r10
+; AVX512BW-NEXT:    movq %rax, %r12
+; AVX512BW-NEXT:    shlq $57, %r12
+; AVX512BW-NEXT:    orq %r10, %r12
+; AVX512BW-NEXT:    movq %rax, %r10
+; AVX512BW-NEXT:    shlq $58, %r10
+; AVX512BW-NEXT:    orq %r12, %r10
+; AVX512BW-NEXT:    shlq $59, %rax
+; AVX512BW-NEXT:    orq %r10, %rax
+; AVX512BW-NEXT:    movl %r15d, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %r10
+; AVX512BW-NEXT:    shlq $60, %r10
+; AVX512BW-NEXT:    orq %rax, %r10
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $61, %rax
+; AVX512BW-NEXT:    orq %r10, %rax
+; AVX512BW-NEXT:    kmovd %k1, %r13d
+; AVX512BW-NEXT:    kshiftrq $53, %k0, %k1
+; AVX512BW-NEXT:    movzbl %r15b, %r10d
+; AVX512BW-NEXT:    shlq $62, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r10, %rax
+; AVX512BW-NEXT:    shlq $63, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    orq %r11, %rax
+; AVX512BW-NEXT:    movq %rax, (%rsi)
+; AVX512BW-NEXT:    movzbl %r13b, %r11d
+; AVX512BW-NEXT:    movl %r11d, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    leaq (%rax,%rax,2), %r15
+; AVX512BW-NEXT:    leaq (%r15,%rax,4), %r15
+; AVX512BW-NEXT:    leaq (%r15,%rax,8), %rax
+; AVX512BW-NEXT:    movzbl %r14b, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %r15
+; AVX512BW-NEXT:    shlq $4, %r15
+; AVX512BW-NEXT:    orq %rax, %r15
+; AVX512BW-NEXT:    movq %r12, %r14
+; AVX512BW-NEXT:    shlq $5, %r14
+; AVX512BW-NEXT:    orq %r15, %r14
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $6, %rax
+; AVX512BW-NEXT:    movq %r12, %r15
+; AVX512BW-NEXT:    shlq $7, %r15
+; AVX512BW-NEXT:    orq %rax, %r15
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    kshiftrq $54, %k0, %k1
+; AVX512BW-NEXT:    shlq $8, %r12
+; AVX512BW-NEXT:    orq %r15, %r12
+; AVX512BW-NEXT:    movzbl %al, %r15d
+; AVX512BW-NEXT:    andl $1, %r15d
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $9, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r15, %r12
+; AVX512BW-NEXT:    shlq $10, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $11, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r15, %r12
+; AVX512BW-NEXT:    shlq $12, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    kshiftrq $55, %k0, %k1
+; AVX512BW-NEXT:    shlq $13, %r15
+; AVX512BW-NEXT:    orq %r12, %r15
+; AVX512BW-NEXT:    movzbl %al, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $14, %rax
+; AVX512BW-NEXT:    orq %r15, %rax
+; AVX512BW-NEXT:    movq %r12, %r15
+; AVX512BW-NEXT:    shlq $15, %r15
+; AVX512BW-NEXT:    orq %rax, %r15
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $16, %rax
+; AVX512BW-NEXT:    orq %r15, %rax
+; AVX512BW-NEXT:    movq %r12, %r15
+; AVX512BW-NEXT:    shlq $17, %r15
+; AVX512BW-NEXT:    orq %rax, %r15
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    kshiftrq $56, %k0, %k1
+; AVX512BW-NEXT:    shlq $18, %r12
+; AVX512BW-NEXT:    orq %r15, %r12
+; AVX512BW-NEXT:    movzbl %al, %r15d
+; AVX512BW-NEXT:    andl $1, %r15d
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $19, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r15, %r12
+; AVX512BW-NEXT:    shlq $20, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $21, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r15, %r12
+; AVX512BW-NEXT:    shlq $22, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    kshiftrq $63, %k0, %k1
+; AVX512BW-NEXT:    shlq $23, %r15
+; AVX512BW-NEXT:    orq %r12, %r15
+; AVX512BW-NEXT:    movzbl %al, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $24, %rax
+; AVX512BW-NEXT:    orq %r15, %rax
+; AVX512BW-NEXT:    movq %r12, %r15
+; AVX512BW-NEXT:    shlq $25, %r15
+; AVX512BW-NEXT:    orq %rax, %r15
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $26, %rax
+; AVX512BW-NEXT:    orq %r15, %rax
+; AVX512BW-NEXT:    movq %r12, %r15
+; AVX512BW-NEXT:    shlq $27, %r15
+; AVX512BW-NEXT:    orq %rax, %r15
+; AVX512BW-NEXT:    shlq $28, %r12
+; AVX512BW-NEXT:    orq %r15, %r12
+; AVX512BW-NEXT:    movzbl %bl, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %r15
+; AVX512BW-NEXT:    shlq $29, %r15
+; AVX512BW-NEXT:    orq %r12, %r15
+; AVX512BW-NEXT:    movq %rax, %r12
+; AVX512BW-NEXT:    shlq $30, %r12
+; AVX512BW-NEXT:    orq %r15, %r12
+; AVX512BW-NEXT:    kmovd %k1, %r15d
+; AVX512BW-NEXT:    kshiftrq $58, %k0, %k1
+; AVX512BW-NEXT:    shlq $31, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    andl $1, %ebx
+; AVX512BW-NEXT:    movq %rbx, %r13
+; AVX512BW-NEXT:    shlq $32, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrq $59, %k0, %k1
+; AVX512BW-NEXT:    shlq $33, %rbx
+; AVX512BW-NEXT:    orq %r13, %rbx
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $34, %rax
+; AVX512BW-NEXT:    orq %rbx, %rax
+; AVX512BW-NEXT:    movq %r12, %rbx
+; AVX512BW-NEXT:    shlq $35, %rbx
+; AVX512BW-NEXT:    orq %rax, %rbx
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $36, %rax
+; AVX512BW-NEXT:    orq %rbx, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $37, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    kmovd %k1, %ebx
+; AVX512BW-NEXT:    kshiftrq $60, %k0, %k1
+; AVX512BW-NEXT:    shlq $38, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    andl $1, %ebx
+; AVX512BW-NEXT:    movq %rbx, %rax
+; AVX512BW-NEXT:    shlq $39, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %rbx, %r12
+; AVX512BW-NEXT:    shlq $40, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %rbx, %rax
+; AVX512BW-NEXT:    shlq $41, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %rbx, %r13
+; AVX512BW-NEXT:    shlq $42, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrq $61, %k0, %k1
+; AVX512BW-NEXT:    shlq $43, %rbx
+; AVX512BW-NEXT:    orq %r13, %rbx
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $44, %rax
+; AVX512BW-NEXT:    orq %rbx, %rax
+; AVX512BW-NEXT:    movq %r12, %rbx
+; AVX512BW-NEXT:    shlq $45, %rbx
+; AVX512BW-NEXT:    orq %rax, %rbx
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $46, %rax
+; AVX512BW-NEXT:    orq %rbx, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $47, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    kmovd %k1, %ebx
+; AVX512BW-NEXT:    kshiftrq $62, %k0, %k1
+; AVX512BW-NEXT:    shlq $48, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    andl $1, %ebx
+; AVX512BW-NEXT:    movq %rbx, %rax
+; AVX512BW-NEXT:    shlq $49, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %rbx, %r12
+; AVX512BW-NEXT:    shlq $50, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %rbx, %rax
+; AVX512BW-NEXT:    shlq $51, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %rbx, %r12
+; AVX512BW-NEXT:    shlq $52, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    kshiftrq $25, %k0, %k1
+; AVX512BW-NEXT:    shlq $53, %rbx
+; AVX512BW-NEXT:    orq %r12, %rbx
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %r12
+; AVX512BW-NEXT:    shlq $54, %r12
+; AVX512BW-NEXT:    orq %rbx, %r12
+; AVX512BW-NEXT:    movq %rax, %rbx
+; AVX512BW-NEXT:    shlq $55, %rbx
+; AVX512BW-NEXT:    orq %r12, %rbx
+; AVX512BW-NEXT:    movq %rax, %r12
+; AVX512BW-NEXT:    shlq $56, %r12
+; AVX512BW-NEXT:    orq %rbx, %r12
+; AVX512BW-NEXT:    movq %rax, %rbx
+; AVX512BW-NEXT:    shlq $57, %rbx
+; AVX512BW-NEXT:    orq %r12, %rbx
+; AVX512BW-NEXT:    shlq $58, %rax
+; AVX512BW-NEXT:    orq %rbx, %rax
+; AVX512BW-NEXT:    movl %r15d, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rbx
+; AVX512BW-NEXT:    shlq $59, %rbx
+; AVX512BW-NEXT:    orq %rax, %rbx
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $60, %rax
+; AVX512BW-NEXT:    orq %rbx, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $61, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    kmovd %k1, %ebx
+; AVX512BW-NEXT:    kshiftrq $26, %k0, %k1
+; AVX512BW-NEXT:    shlq $62, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    kshiftrq $28, %k0, %k1
+; AVX512BW-NEXT:    movzbl %r15b, %r15d
+; AVX512BW-NEXT:    shlq $63, %r15
+; AVX512BW-NEXT:    orq %r12, %r15
+; AVX512BW-NEXT:    orq %r14, %r15
+; AVX512BW-NEXT:    movq %r15, 32(%rsi)
+; AVX512BW-NEXT:    movzbl %al, %r13d
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movzbl %bl, %r14d
+; AVX512BW-NEXT:    movl %r14d, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    leaq (%rax,%rax,2), %rax
+; AVX512BW-NEXT:    leaq (%rax,%r13,4), %rax
+; AVX512BW-NEXT:    leaq (%rax,%r13,8), %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $4, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r13, %r15
+; AVX512BW-NEXT:    shlq $5, %r15
+; AVX512BW-NEXT:    orq %r12, %r15
+; AVX512BW-NEXT:    shlq $6, %r13
+; AVX512BW-NEXT:    movzbl %bpl, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $7, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $8, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $9, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $10, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    kshiftrq $29, %k0, %k1
+; AVX512BW-NEXT:    shlq $11, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    movzbl %al, %r13d
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $12, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $13, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $14, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $15, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    kshiftrq $30, %k0, %k1
+; AVX512BW-NEXT:    shlq $16, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    movzbl %al, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $17, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $18, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $19, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $20, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    kshiftrq $31, %k0, %k1
+; AVX512BW-NEXT:    shlq $21, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    movzbl %al, %r13d
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $22, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $23, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $24, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $25, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    kshiftrq $32, %k0, %k1
+; AVX512BW-NEXT:    shlq $26, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    movzbl %al, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $27, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $28, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $29, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %rbp
+; AVX512BW-NEXT:    shlq $30, %rbp
+; AVX512BW-NEXT:    orq %rax, %rbp
+; AVX512BW-NEXT:    kmovd %k1, %r13d
+; AVX512BW-NEXT:    kshiftrq $33, %k0, %k1
+; AVX512BW-NEXT:    shlq $31, %r12
+; AVX512BW-NEXT:    orq %rbp, %r12
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $32, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $33, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $34, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %rbp
+; AVX512BW-NEXT:    shlq $35, %rbp
+; AVX512BW-NEXT:    orq %rax, %rbp
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrq $34, %k0, %k1
+; AVX512BW-NEXT:    shlq $36, %r13
+; AVX512BW-NEXT:    orq %rbp, %r13
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $37, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $38, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $39, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %rbp
+; AVX512BW-NEXT:    shlq $40, %rbp
+; AVX512BW-NEXT:    orq %rax, %rbp
+; AVX512BW-NEXT:    kmovd %k1, %r13d
+; AVX512BW-NEXT:    kshiftrq $35, %k0, %k1
+; AVX512BW-NEXT:    shlq $41, %r12
+; AVX512BW-NEXT:    orq %rbp, %r12
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $42, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $43, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $44, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %rbp
+; AVX512BW-NEXT:    shlq $45, %rbp
+; AVX512BW-NEXT:    orq %rax, %rbp
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrq $36, %k0, %k1
+; AVX512BW-NEXT:    shlq $46, %r13
+; AVX512BW-NEXT:    orq %rbp, %r13
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $47, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $48, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $49, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %rbp
+; AVX512BW-NEXT:    shlq $50, %rbp
+; AVX512BW-NEXT:    orq %rax, %rbp
+; AVX512BW-NEXT:    kmovd %k1, %r13d
+; AVX512BW-NEXT:    kshiftrq $37, %k0, %k1
+; AVX512BW-NEXT:    shlq $51, %r12
+; AVX512BW-NEXT:    orq %rbp, %r12
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $52, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $53, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $54, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %rbp
+; AVX512BW-NEXT:    shlq $55, %rbp
+; AVX512BW-NEXT:    orq %rax, %rbp
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrq $38, %k0, %k1
+; AVX512BW-NEXT:    shlq $56, %r13
+; AVX512BW-NEXT:    orq %rbp, %r13
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $57, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $58, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $59, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $60, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    kshiftrq $40, %k0, %k1
+; AVX512BW-NEXT:    shlq $61, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    movzbl %al, %r13d
+; AVX512BW-NEXT:    # kill: def $eax killed $eax def $rax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    shlq $62, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $63, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    kshiftrq $39, %k0, %k1
+; AVX512BW-NEXT:    orq %r15, %r12
+; AVX512BW-NEXT:    kmovd %k1, %ebp
+; AVX512BW-NEXT:    kshiftrq $41, %k0, %k1
+; AVX512BW-NEXT:    movq %r12, 16(%rsi)
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    leaq (%r13,%r13,2), %r15
+; AVX512BW-NEXT:    leaq (%r15,%r13,4), %r15
+; AVX512BW-NEXT:    movzbl %bpl, %r13d
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    leaq (%r15,%r13,8), %r15
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $4, %r12
+; AVX512BW-NEXT:    orq %r15, %r12
+; AVX512BW-NEXT:    movq %r13, %r15
+; AVX512BW-NEXT:    shlq $5, %r15
+; AVX512BW-NEXT:    orq %r12, %r15
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $6, %r12
+; AVX512BW-NEXT:    shlq $7, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    movzbl %al, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $8, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $9, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $10, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $11, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    kshiftrq $42, %k0, %k1
+; AVX512BW-NEXT:    shlq $12, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    movzbl %al, %r13d
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $13, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $14, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $15, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $16, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    kshiftrq $43, %k0, %k1
+; AVX512BW-NEXT:    shlq $17, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    movzbl %al, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $18, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $19, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $20, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $21, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    kshiftrq $44, %k0, %k1
+; AVX512BW-NEXT:    shlq $22, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    movzbl %al, %r13d
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $23, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $24, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $25, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $26, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    kshiftrq $45, %k0, %k1
+; AVX512BW-NEXT:    shlq $27, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    movzbl %al, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $28, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $29, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $30, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %rbp
+; AVX512BW-NEXT:    shlq $31, %rbp
+; AVX512BW-NEXT:    orq %rax, %rbp
+; AVX512BW-NEXT:    kmovd %k1, %r13d
+; AVX512BW-NEXT:    kshiftrq $46, %k0, %k1
+; AVX512BW-NEXT:    shlq $32, %r12
+; AVX512BW-NEXT:    orq %rbp, %r12
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $33, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $34, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $35, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %rbp
+; AVX512BW-NEXT:    shlq $36, %rbp
+; AVX512BW-NEXT:    orq %rax, %rbp
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrq $47, %k0, %k1
+; AVX512BW-NEXT:    shlq $37, %r13
+; AVX512BW-NEXT:    orq %rbp, %r13
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $38, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $39, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $40, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $41, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    kshiftrq $48, %k0, %k1
+; AVX512BW-NEXT:    shlq $42, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    movzbl %al, %r13d
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $43, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $44, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $45, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %rbp
+; AVX512BW-NEXT:    shlq $46, %rbp
+; AVX512BW-NEXT:    orq %rax, %rbp
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrq $49, %k0, %k1
+; AVX512BW-NEXT:    shlq $47, %r13
+; AVX512BW-NEXT:    orq %rbp, %r13
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $48, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $49, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $50, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %rbp
+; AVX512BW-NEXT:    shlq $51, %rbp
+; AVX512BW-NEXT:    orq %rax, %rbp
+; AVX512BW-NEXT:    kmovd %k1, %r13d
+; AVX512BW-NEXT:    kshiftrq $50, %k0, %k1
+; AVX512BW-NEXT:    shlq $52, %r12
+; AVX512BW-NEXT:    orq %rbp, %r12
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $53, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $54, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $55, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %rbp
+; AVX512BW-NEXT:    shlq $56, %rbp
+; AVX512BW-NEXT:    orq %rax, %rbp
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrq $13, %k0, %k1
+; AVX512BW-NEXT:    shlq $57, %r13
+; AVX512BW-NEXT:    orq %rbp, %r13
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $58, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $59, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $60, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $61, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    kmovd %k1, %ebp
+; AVX512BW-NEXT:    kshiftrq $20, %k0, %k1
+; AVX512BW-NEXT:    shlq $62, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    kmovd %k1, %r13d
+; AVX512BW-NEXT:    kshiftrq $21, %k0, %k1
+; AVX512BW-NEXT:    shlq $63, %r11
+; AVX512BW-NEXT:    orq %r12, %r11
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrq $22, %k0, %k1
+; AVX512BW-NEXT:    orq %r15, %r11
+; AVX512BW-NEXT:    kmovd %k1, %r15d
+; AVX512BW-NEXT:    kshiftrq $23, %k0, %k1
+; AVX512BW-NEXT:    movq %r11, 24(%rsi)
+; AVX512BW-NEXT:    movzbl %bpl, %r11d
+; AVX512BW-NEXT:    andl $1, %r11d
+; AVX512BW-NEXT:    andl $1, %r10d
+; AVX512BW-NEXT:    leaq (%r10,%r11,2), %rax
+; AVX512BW-NEXT:    leaq (%rax,%r11,4), %rax
+; AVX512BW-NEXT:    leaq (%rax,%r11,8), %rax
+; AVX512BW-NEXT:    movq %r11, %r10
+; AVX512BW-NEXT:    shlq $4, %r10
+; AVX512BW-NEXT:    orq %rax, %r10
+; AVX512BW-NEXT:    shlq $5, %r11
+; AVX512BW-NEXT:    orq %r10, %r11
+; AVX512BW-NEXT:    movzbl %r9b, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %r9
+; AVX512BW-NEXT:    shlq $6, %r9
+; AVX512BW-NEXT:    movq %rax, %r10
+; AVX512BW-NEXT:    shlq $7, %r10
+; AVX512BW-NEXT:    orq %r9, %r10
+; AVX512BW-NEXT:    movq %rax, %r9
+; AVX512BW-NEXT:    shlq $8, %r9
+; AVX512BW-NEXT:    orq %r10, %r9
+; AVX512BW-NEXT:    movq %rax, %r10
+; AVX512BW-NEXT:    shlq $9, %r10
+; AVX512BW-NEXT:    orq %r9, %r10
+; AVX512BW-NEXT:    kmovd %k1, %r9d
+; AVX512BW-NEXT:    kshiftrq $24, %k0, %k0
+; AVX512BW-NEXT:    shlq $10, %rax
+; AVX512BW-NEXT:    orq %r10, %rax
+; AVX512BW-NEXT:    movzbl %r8b, %r10d
+; AVX512BW-NEXT:    andl $1, %r10d
+; AVX512BW-NEXT:    movq %r10, %r8
+; AVX512BW-NEXT:    shlq $11, %r8
+; AVX512BW-NEXT:    orq %rax, %r8
+; AVX512BW-NEXT:    movq %r10, %rax
+; AVX512BW-NEXT:    shlq $12, %rax
+; AVX512BW-NEXT:    orq %r8, %rax
+; AVX512BW-NEXT:    movq %r10, %r8
+; AVX512BW-NEXT:    shlq $13, %r8
+; AVX512BW-NEXT:    orq %rax, %r8
+; AVX512BW-NEXT:    movq %r10, %rax
+; AVX512BW-NEXT:    shlq $14, %rax
+; AVX512BW-NEXT:    orq %r8, %rax
+; AVX512BW-NEXT:    kmovd %k0, %r8d
+; AVX512BW-NEXT:    shlq $15, %r10
+; AVX512BW-NEXT:    orq %rax, %r10
+; AVX512BW-NEXT:    movzbl %dil, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %rdi
+; AVX512BW-NEXT:    shlq $16, %rdi
+; AVX512BW-NEXT:    orq %r10, %rdi
+; AVX512BW-NEXT:    movq %rax, %r10
+; AVX512BW-NEXT:    shlq $17, %r10
+; AVX512BW-NEXT:    orq %rdi, %r10
+; AVX512BW-NEXT:    movq %rax, %rdi
+; AVX512BW-NEXT:    shlq $18, %rdi
+; AVX512BW-NEXT:    orq %r10, %rdi
+; AVX512BW-NEXT:    movq %rax, %r10
+; AVX512BW-NEXT:    shlq $19, %r10
+; AVX512BW-NEXT:    orq %rdi, %r10
+; AVX512BW-NEXT:    shlq $20, %rax
+; AVX512BW-NEXT:    orq %r10, %rax
+; AVX512BW-NEXT:    movzbl %dl, %edx
+; AVX512BW-NEXT:    andl $1, %edx
+; AVX512BW-NEXT:    movq %rdx, %rdi
+; AVX512BW-NEXT:    shlq $21, %rdi
+; AVX512BW-NEXT:    orq %rax, %rdi
+; AVX512BW-NEXT:    movq %rdx, %rax
+; AVX512BW-NEXT:    shlq $22, %rax
+; AVX512BW-NEXT:    orq %rdi, %rax
+; AVX512BW-NEXT:    movq %rdx, %rdi
+; AVX512BW-NEXT:    shlq $23, %rdi
+; AVX512BW-NEXT:    orq %rax, %rdi
+; AVX512BW-NEXT:    movq %rdx, %rax
+; AVX512BW-NEXT:    shlq $24, %rax
+; AVX512BW-NEXT:    orq %rdi, %rax
+; AVX512BW-NEXT:    shlq $25, %rdx
+; AVX512BW-NEXT:    orq %rax, %rdx
+; AVX512BW-NEXT:    movzbl %cl, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %rcx
+; AVX512BW-NEXT:    shlq $26, %rcx
+; AVX512BW-NEXT:    orq %rdx, %rcx
+; AVX512BW-NEXT:    movq %rax, %rdx
+; AVX512BW-NEXT:    shlq $27, %rdx
+; AVX512BW-NEXT:    orq %rcx, %rdx
+; AVX512BW-NEXT:    movq %rax, %rcx
+; AVX512BW-NEXT:    shlq $28, %rcx
+; AVX512BW-NEXT:    orq %rdx, %rcx
+; AVX512BW-NEXT:    movq %rax, %rdx
+; AVX512BW-NEXT:    shlq $29, %rdx
+; AVX512BW-NEXT:    orq %rcx, %rdx
+; AVX512BW-NEXT:    shlq $30, %rax
+; AVX512BW-NEXT:    orq %rdx, %rax
+; AVX512BW-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; AVX512BW-NEXT:    andl $1, %ecx
+; AVX512BW-NEXT:    movq %rcx, %rdx
+; AVX512BW-NEXT:    shlq $31, %rdx
+; AVX512BW-NEXT:    orq %rax, %rdx
+; AVX512BW-NEXT:    movq %rcx, %rax
+; AVX512BW-NEXT:    shlq $32, %rax
+; AVX512BW-NEXT:    orq %rdx, %rax
+; AVX512BW-NEXT:    movq %rcx, %rdx
+; AVX512BW-NEXT:    shlq $33, %rdx
+; AVX512BW-NEXT:    orq %rax, %rdx
+; AVX512BW-NEXT:    movq %rcx, %rax
+; AVX512BW-NEXT:    shlq $34, %rax
+; AVX512BW-NEXT:    orq %rdx, %rax
+; AVX512BW-NEXT:    shlq $35, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $36, %rax
+; AVX512BW-NEXT:    orq %rcx, %rax
+; AVX512BW-NEXT:    movq %r13, %rcx
+; AVX512BW-NEXT:    shlq $37, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $38, %rax
+; AVX512BW-NEXT:    orq %rcx, %rax
+; AVX512BW-NEXT:    movq %r13, %rcx
+; AVX512BW-NEXT:    shlq $39, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    shlq $40, %r13
+; AVX512BW-NEXT:    orq %rcx, %r13
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $41, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %rcx
+; AVX512BW-NEXT:    shlq $42, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $43, %rax
+; AVX512BW-NEXT:    orq %rcx, %rax
+; AVX512BW-NEXT:    movq %r12, %rcx
+; AVX512BW-NEXT:    shlq $44, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    shlq $45, %r12
+; AVX512BW-NEXT:    orq %rcx, %r12
+; AVX512BW-NEXT:    andl $1, %r15d
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $46, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r15, %rcx
+; AVX512BW-NEXT:    shlq $47, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $48, %rax
+; AVX512BW-NEXT:    orq %rcx, %rax
+; AVX512BW-NEXT:    movq %r15, %rcx
+; AVX512BW-NEXT:    shlq $49, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    shlq $50, %r15
+; AVX512BW-NEXT:    orq %rcx, %r15
+; AVX512BW-NEXT:    andl $1, %r9d
+; AVX512BW-NEXT:    movq %r9, %rax
+; AVX512BW-NEXT:    shlq $51, %rax
+; AVX512BW-NEXT:    orq %r15, %rax
+; AVX512BW-NEXT:    movq %r9, %rcx
+; AVX512BW-NEXT:    shlq $52, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    movq %r9, %rax
+; AVX512BW-NEXT:    shlq $53, %rax
+; AVX512BW-NEXT:    orq %rcx, %rax
+; AVX512BW-NEXT:    movq %r9, %rcx
+; AVX512BW-NEXT:    shlq $54, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    shlq $55, %r9
+; AVX512BW-NEXT:    orq %rcx, %r9
+; AVX512BW-NEXT:    andl $1, %r8d
+; AVX512BW-NEXT:    movq %r8, %rax
+; AVX512BW-NEXT:    shlq $56, %rax
+; AVX512BW-NEXT:    orq %r9, %rax
+; AVX512BW-NEXT:    movq %r8, %rcx
+; AVX512BW-NEXT:    shlq $57, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    movq %r8, %rax
+; AVX512BW-NEXT:    shlq $58, %rax
+; AVX512BW-NEXT:    orq %rcx, %rax
+; AVX512BW-NEXT:    movq %r8, %rcx
+; AVX512BW-NEXT:    shlq $59, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    shlq $60, %r8
+; AVX512BW-NEXT:    orq %rcx, %r8
+; AVX512BW-NEXT:    andl $1, %ebx
+; AVX512BW-NEXT:    movq %rbx, %rax
+; AVX512BW-NEXT:    shlq $61, %rax
+; AVX512BW-NEXT:    orq %r8, %rax
+; AVX512BW-NEXT:    shlq $62, %rbx
+; AVX512BW-NEXT:    orq %rax, %rbx
+; AVX512BW-NEXT:    shlq $63, %r14
+; AVX512BW-NEXT:    orq %rbx, %r14
+; AVX512BW-NEXT:    orq %r11, %r14
+; AVX512BW-NEXT:    movq %r14, 8(%rsi)
+; AVX512BW-NEXT:    popq %rbx
+; AVX512BW-NEXT:    popq %r12
+; AVX512BW-NEXT:    popq %r13
+; AVX512BW-NEXT:    popq %r14
+; AVX512BW-NEXT:    popq %r15
+; AVX512BW-NEXT:    popq %rbp
+; AVX512BW-NEXT:    retq
+  %src.vec = load <64 x i1>, ptr %in.vec, align 64
+  %tgt.mask = shufflevector <64 x i1> %src.vec, <64 x i1> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, 
i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
+  store <320 x i1> %tgt.mask, ptr %out.vec, align 64
+  ret void
+}
+
+; Replication factor 6, 2 source elements: loads a <2 x i1> mask from %in.vec,
+; repeats each element six times with a shufflevector (index 0 six times, then
+; index 1 six times) and stores the resulting <12 x i1> mask to %out.vec.
+; The assertions below are autogenerated (see the NOTE at the top of this file);
+; regenerate them with utils/update_llc_test_checks.py rather than editing by hand.
+; In the expected output of all three prefix groups used here (AVX512F-ONLY,
+; AVX512DQ, AVX512BW) the 12 result bits are assembled in a general-purpose
+; register with shift/or sequences, masked with 0xFFF and written with a single
+; 16-bit store to (%rsi).
+define void @mask_replication_factor6_vf2(ptr %in.vec, ptr %out.vec) nounwind {
+; AVX512F-ONLY-LABEL: mask_replication_factor6_vf2:
+; AVX512F-ONLY:       # %bb.0:
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
+; AVX512F-ONLY-NEXT:    kshiftrw $1, %k1, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k1, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    leal (%rax,%rax,2), %ecx
+; AVX512F-ONLY-NEXT:    leal (%rcx,%rax,4), %ecx
+; AVX512F-ONLY-NEXT:    leal (%rcx,%rax,8), %ecx
+; AVX512F-ONLY-NEXT:    movl %eax, %edx
+; AVX512F-ONLY-NEXT:    shll $4, %edx
+; AVX512F-ONLY-NEXT:    orl %ecx, %edx
+; AVX512F-ONLY-NEXT:    shll $5, %eax
+; AVX512F-ONLY-NEXT:    orl %edx, %eax
+; AVX512F-ONLY-NEXT:    kmovw %k0, %ecx
+; AVX512F-ONLY-NEXT:    movl %ecx, %edx
+; AVX512F-ONLY-NEXT:    andl $1, %edx
+; AVX512F-ONLY-NEXT:    movl %edx, %edi
+; AVX512F-ONLY-NEXT:    shll $6, %edi
+; AVX512F-ONLY-NEXT:    movl %edx, %r8d
+; AVX512F-ONLY-NEXT:    shll $7, %r8d
+; AVX512F-ONLY-NEXT:    orl %edi, %r8d
+; AVX512F-ONLY-NEXT:    movl %edx, %edi
+; AVX512F-ONLY-NEXT:    shll $8, %edi
+; AVX512F-ONLY-NEXT:    orl %r8d, %edi
+; AVX512F-ONLY-NEXT:    movl %edx, %r8d
+; AVX512F-ONLY-NEXT:    shll $9, %r8d
+; AVX512F-ONLY-NEXT:    orl %edi, %r8d
+; AVX512F-ONLY-NEXT:    shll $10, %edx
+; AVX512F-ONLY-NEXT:    orl %r8d, %edx
+; AVX512F-ONLY-NEXT:    shll $11, %ecx
+; AVX512F-ONLY-NEXT:    orl %edx, %ecx
+; AVX512F-ONLY-NEXT:    orl %eax, %ecx
+; AVX512F-ONLY-NEXT:    andl $4095, %ecx # imm = 0xFFF
+; AVX512F-ONLY-NEXT:    movw %cx, (%rsi)
+; AVX512F-ONLY-NEXT:    retq
+;
+; AVX512DQ-LABEL: mask_replication_factor6_vf2:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    kmovb (%rdi), %k1
+; AVX512DQ-NEXT:    kshiftrb $1, %k1, %k0
+; AVX512DQ-NEXT:    kmovw %k1, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    leal (%rax,%rax,2), %ecx
+; AVX512DQ-NEXT:    leal (%rcx,%rax,4), %ecx
+; AVX512DQ-NEXT:    leal (%rcx,%rax,8), %ecx
+; AVX512DQ-NEXT:    movl %eax, %edx
+; AVX512DQ-NEXT:    shll $4, %edx
+; AVX512DQ-NEXT:    orl %ecx, %edx
+; AVX512DQ-NEXT:    shll $5, %eax
+; AVX512DQ-NEXT:    orl %edx, %eax
+; AVX512DQ-NEXT:    kmovw %k0, %ecx
+; AVX512DQ-NEXT:    movl %ecx, %edx
+; AVX512DQ-NEXT:    andl $1, %edx
+; AVX512DQ-NEXT:    movl %edx, %edi
+; AVX512DQ-NEXT:    shll $6, %edi
+; AVX512DQ-NEXT:    movl %edx, %r8d
+; AVX512DQ-NEXT:    shll $7, %r8d
+; AVX512DQ-NEXT:    orl %edi, %r8d
+; AVX512DQ-NEXT:    movl %edx, %edi
+; AVX512DQ-NEXT:    shll $8, %edi
+; AVX512DQ-NEXT:    orl %r8d, %edi
+; AVX512DQ-NEXT:    movl %edx, %r8d
+; AVX512DQ-NEXT:    shll $9, %r8d
+; AVX512DQ-NEXT:    orl %edi, %r8d
+; AVX512DQ-NEXT:    shll $10, %edx
+; AVX512DQ-NEXT:    orl %r8d, %edx
+; AVX512DQ-NEXT:    shll $11, %ecx
+; AVX512DQ-NEXT:    orl %edx, %ecx
+; AVX512DQ-NEXT:    orl %eax, %ecx
+; AVX512DQ-NEXT:    andl $4095, %ecx # imm = 0xFFF
+; AVX512DQ-NEXT:    movw %cx, (%rsi)
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: mask_replication_factor6_vf2:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    kmovw (%rdi), %k1
+; AVX512BW-NEXT:    kshiftrw $1, %k1, %k0
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    leal (%rax,%rax,2), %ecx
+; AVX512BW-NEXT:    leal (%rcx,%rax,4), %ecx
+; AVX512BW-NEXT:    leal (%rcx,%rax,8), %ecx
+; AVX512BW-NEXT:    movl %eax, %edx
+; AVX512BW-NEXT:    shll $4, %edx
+; AVX512BW-NEXT:    orl %ecx, %edx
+; AVX512BW-NEXT:    shll $5, %eax
+; AVX512BW-NEXT:    orl %edx, %eax
+; AVX512BW-NEXT:    kmovd %k0, %ecx
+; AVX512BW-NEXT:    movl %ecx, %edx
+; AVX512BW-NEXT:    andl $1, %edx
+; AVX512BW-NEXT:    movl %edx, %edi
+; AVX512BW-NEXT:    shll $6, %edi
+; AVX512BW-NEXT:    movl %edx, %r8d
+; AVX512BW-NEXT:    shll $7, %r8d
+; AVX512BW-NEXT:    orl %edi, %r8d
+; AVX512BW-NEXT:    movl %edx, %edi
+; AVX512BW-NEXT:    shll $8, %edi
+; AVX512BW-NEXT:    orl %r8d, %edi
+; AVX512BW-NEXT:    movl %edx, %r8d
+; AVX512BW-NEXT:    shll $9, %r8d
+; AVX512BW-NEXT:    orl %edi, %r8d
+; AVX512BW-NEXT:    shll $10, %edx
+; AVX512BW-NEXT:    orl %r8d, %edx
+; AVX512BW-NEXT:    shll $11, %ecx
+; AVX512BW-NEXT:    orl %edx, %ecx
+; AVX512BW-NEXT:    orl %eax, %ecx
+; AVX512BW-NEXT:    andl $4095, %ecx # imm = 0xFFF
+; AVX512BW-NEXT:    movw %cx, (%rsi)
+; AVX512BW-NEXT:    retq
+  ; Load the 2-element i1 source mask.
+  %src.vec = load <2 x i1>, ptr %in.vec, align 64
+  ; Replicate: result lanes 0-5 take source element 0, lanes 6-11 take element 1.
+  %tgt.mask = shufflevector <2 x i1> %src.vec, <2 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  ; Store the 12-element replicated mask.
+  store <12 x i1> %tgt.mask, ptr %out.vec, align 64
+  ret void
+}
+
+; Replication factor 6, 4 source elements: loads a <4 x i1> mask from %in.vec,
+; repeats each element six times with a shufflevector (indices 0, 1, 2, 3, each
+; six times in a row) and stores the resulting <24 x i1> mask to %out.vec.
+; The assertions below are autogenerated (see the NOTE at the top of this file);
+; regenerate them with utils/update_llc_test_checks.py rather than editing by hand.
+; In the expected output of all three prefix groups used here (AVX512F-ONLY,
+; AVX512DQ, AVX512BW) the result bits are assembled in general-purpose
+; registers with shift/or sequences; the low 16 bits are written with a 16-bit
+; store to (%rsi) and bits 16-23 with a byte store to 2(%rsi).
+define void @mask_replication_factor6_vf4(ptr %in.vec, ptr %out.vec) nounwind {
+; AVX512F-ONLY-LABEL: mask_replication_factor6_vf4:
+; AVX512F-ONLY:       # %bb.0:
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k3
+; AVX512F-ONLY-NEXT:    kshiftrw $3, %k3, %k0
+; AVX512F-ONLY-NEXT:    kshiftrw $2, %k3, %k1
+; AVX512F-ONLY-NEXT:    kshiftrw $1, %k3, %k2
+; AVX512F-ONLY-NEXT:    kmovw %k3, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    leal (%rax,%rax,2), %ecx
+; AVX512F-ONLY-NEXT:    leal (%rcx,%rax,4), %ecx
+; AVX512F-ONLY-NEXT:    leal (%rcx,%rax,8), %ecx
+; AVX512F-ONLY-NEXT:    movl %eax, %edx
+; AVX512F-ONLY-NEXT:    shll $4, %edx
+; AVX512F-ONLY-NEXT:    orl %ecx, %edx
+; AVX512F-ONLY-NEXT:    shll $5, %eax
+; AVX512F-ONLY-NEXT:    orl %edx, %eax
+; AVX512F-ONLY-NEXT:    kmovw %k2, %ecx
+; AVX512F-ONLY-NEXT:    andl $1, %ecx
+; AVX512F-ONLY-NEXT:    movl %ecx, %edx
+; AVX512F-ONLY-NEXT:    shll $6, %edx
+; AVX512F-ONLY-NEXT:    movl %ecx, %edi
+; AVX512F-ONLY-NEXT:    shll $7, %edi
+; AVX512F-ONLY-NEXT:    orl %edx, %edi
+; AVX512F-ONLY-NEXT:    movl %ecx, %edx
+; AVX512F-ONLY-NEXT:    shll $8, %edx
+; AVX512F-ONLY-NEXT:    orl %edi, %edx
+; AVX512F-ONLY-NEXT:    movl %ecx, %edi
+; AVX512F-ONLY-NEXT:    shll $9, %edi
+; AVX512F-ONLY-NEXT:    orl %edx, %edi
+; AVX512F-ONLY-NEXT:    movl %ecx, %edx
+; AVX512F-ONLY-NEXT:    shll $10, %edx
+; AVX512F-ONLY-NEXT:    orl %edi, %edx
+; AVX512F-ONLY-NEXT:    shll $11, %ecx
+; AVX512F-ONLY-NEXT:    orl %edx, %ecx
+; AVX512F-ONLY-NEXT:    kmovw %k1, %edx
+; AVX512F-ONLY-NEXT:    andl $1, %edx
+; AVX512F-ONLY-NEXT:    movl %edx, %edi
+; AVX512F-ONLY-NEXT:    shll $12, %edi
+; AVX512F-ONLY-NEXT:    orl %ecx, %edi
+; AVX512F-ONLY-NEXT:    movl %edx, %ecx
+; AVX512F-ONLY-NEXT:    shll $13, %ecx
+; AVX512F-ONLY-NEXT:    orl %edi, %ecx
+; AVX512F-ONLY-NEXT:    movl %edx, %edi
+; AVX512F-ONLY-NEXT:    shll $14, %edi
+; AVX512F-ONLY-NEXT:    orl %ecx, %edi
+; AVX512F-ONLY-NEXT:    movl %edx, %ecx
+; AVX512F-ONLY-NEXT:    shll $15, %ecx
+; AVX512F-ONLY-NEXT:    orl %edi, %ecx
+; AVX512F-ONLY-NEXT:    orl %eax, %ecx
+; AVX512F-ONLY-NEXT:    movw %cx, (%rsi)
+; AVX512F-ONLY-NEXT:    movl %edx, %eax
+; AVX512F-ONLY-NEXT:    shll $16, %eax
+; AVX512F-ONLY-NEXT:    shll $17, %edx
+; AVX512F-ONLY-NEXT:    orl %eax, %edx
+; AVX512F-ONLY-NEXT:    kmovw %k0, %eax
+; AVX512F-ONLY-NEXT:    movl %eax, %edi
+; AVX512F-ONLY-NEXT:    andl $1, %edi
+; AVX512F-ONLY-NEXT:    movl %edi, %r8d
+; AVX512F-ONLY-NEXT:    shll $18, %r8d
+; AVX512F-ONLY-NEXT:    orl %edx, %r8d
+; AVX512F-ONLY-NEXT:    movl %edi, %edx
+; AVX512F-ONLY-NEXT:    shll $19, %edx
+; AVX512F-ONLY-NEXT:    orl %r8d, %edx
+; AVX512F-ONLY-NEXT:    movl %edi, %r8d
+; AVX512F-ONLY-NEXT:    shll $20, %r8d
+; AVX512F-ONLY-NEXT:    orl %edx, %r8d
+; AVX512F-ONLY-NEXT:    movl %edi, %edx
+; AVX512F-ONLY-NEXT:    shll $21, %edx
+; AVX512F-ONLY-NEXT:    orl %r8d, %edx
+; AVX512F-ONLY-NEXT:    shll $22, %edi
+; AVX512F-ONLY-NEXT:    orl %edx, %edi
+; AVX512F-ONLY-NEXT:    shll $23, %eax
+; AVX512F-ONLY-NEXT:    orl %edi, %eax
+; AVX512F-ONLY-NEXT:    orl %ecx, %eax
+; AVX512F-ONLY-NEXT:    shrl $16, %eax
+; AVX512F-ONLY-NEXT:    movb %al, 2(%rsi)
+; AVX512F-ONLY-NEXT:    retq
+;
+; AVX512DQ-LABEL: mask_replication_factor6_vf4:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    kmovb (%rdi), %k3
+; AVX512DQ-NEXT:    kshiftrb $3, %k3, %k0
+; AVX512DQ-NEXT:    kshiftrb $2, %k3, %k1
+; AVX512DQ-NEXT:    kshiftrb $1, %k3, %k2
+; AVX512DQ-NEXT:    kmovw %k3, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    leal (%rax,%rax,2), %ecx
+; AVX512DQ-NEXT:    leal (%rcx,%rax,4), %ecx
+; AVX512DQ-NEXT:    leal (%rcx,%rax,8), %ecx
+; AVX512DQ-NEXT:    movl %eax, %edx
+; AVX512DQ-NEXT:    shll $4, %edx
+; AVX512DQ-NEXT:    orl %ecx, %edx
+; AVX512DQ-NEXT:    shll $5, %eax
+; AVX512DQ-NEXT:    orl %edx, %eax
+; AVX512DQ-NEXT:    kmovw %k2, %ecx
+; AVX512DQ-NEXT:    andl $1, %ecx
+; AVX512DQ-NEXT:    movl %ecx, %edx
+; AVX512DQ-NEXT:    shll $6, %edx
+; AVX512DQ-NEXT:    movl %ecx, %edi
+; AVX512DQ-NEXT:    shll $7, %edi
+; AVX512DQ-NEXT:    orl %edx, %edi
+; AVX512DQ-NEXT:    movl %ecx, %edx
+; AVX512DQ-NEXT:    shll $8, %edx
+; AVX512DQ-NEXT:    orl %edi, %edx
+; AVX512DQ-NEXT:    movl %ecx, %edi
+; AVX512DQ-NEXT:    shll $9, %edi
+; AVX512DQ-NEXT:    orl %edx, %edi
+; AVX512DQ-NEXT:    movl %ecx, %edx
+; AVX512DQ-NEXT:    shll $10, %edx
+; AVX512DQ-NEXT:    orl %edi, %edx
+; AVX512DQ-NEXT:    shll $11, %ecx
+; AVX512DQ-NEXT:    orl %edx, %ecx
+; AVX512DQ-NEXT:    kmovw %k1, %edx
+; AVX512DQ-NEXT:    andl $1, %edx
+; AVX512DQ-NEXT:    movl %edx, %edi
+; AVX512DQ-NEXT:    shll $12, %edi
+; AVX512DQ-NEXT:    orl %ecx, %edi
+; AVX512DQ-NEXT:    movl %edx, %ecx
+; AVX512DQ-NEXT:    shll $13, %ecx
+; AVX512DQ-NEXT:    orl %edi, %ecx
+; AVX512DQ-NEXT:    movl %edx, %edi
+; AVX512DQ-NEXT:    shll $14, %edi
+; AVX512DQ-NEXT:    orl %ecx, %edi
+; AVX512DQ-NEXT:    movl %edx, %ecx
+; AVX512DQ-NEXT:    shll $15, %ecx
+; AVX512DQ-NEXT:    orl %edi, %ecx
+; AVX512DQ-NEXT:    orl %eax, %ecx
+; AVX512DQ-NEXT:    movw %cx, (%rsi)
+; AVX512DQ-NEXT:    movl %edx, %eax
+; AVX512DQ-NEXT:    shll $16, %eax
+; AVX512DQ-NEXT:    shll $17, %edx
+; AVX512DQ-NEXT:    orl %eax, %edx
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    movl %eax, %edi
+; AVX512DQ-NEXT:    andl $1, %edi
+; AVX512DQ-NEXT:    movl %edi, %r8d
+; AVX512DQ-NEXT:    shll $18, %r8d
+; AVX512DQ-NEXT:    orl %edx, %r8d
+; AVX512DQ-NEXT:    movl %edi, %edx
+; AVX512DQ-NEXT:    shll $19, %edx
+; AVX512DQ-NEXT:    orl %r8d, %edx
+; AVX512DQ-NEXT:    movl %edi, %r8d
+; AVX512DQ-NEXT:    shll $20, %r8d
+; AVX512DQ-NEXT:    orl %edx, %r8d
+; AVX512DQ-NEXT:    movl %edi, %edx
+; AVX512DQ-NEXT:    shll $21, %edx
+; AVX512DQ-NEXT:    orl %r8d, %edx
+; AVX512DQ-NEXT:    shll $22, %edi
+; AVX512DQ-NEXT:    orl %edx, %edi
+; AVX512DQ-NEXT:    shll $23, %eax
+; AVX512DQ-NEXT:    orl %edi, %eax
+; AVX512DQ-NEXT:    orl %ecx, %eax
+; AVX512DQ-NEXT:    shrl $16, %eax
+; AVX512DQ-NEXT:    movb %al, 2(%rsi)
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: mask_replication_factor6_vf4:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    kmovw (%rdi), %k3
+; AVX512BW-NEXT:    kshiftrw $3, %k3, %k0
+; AVX512BW-NEXT:    kshiftrw $2, %k3, %k1
+; AVX512BW-NEXT:    kshiftrw $1, %k3, %k2
+; AVX512BW-NEXT:    kmovd %k3, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    leal (%rax,%rax,2), %ecx
+; AVX512BW-NEXT:    leal (%rcx,%rax,4), %ecx
+; AVX512BW-NEXT:    leal (%rcx,%rax,8), %ecx
+; AVX512BW-NEXT:    movl %eax, %edx
+; AVX512BW-NEXT:    shll $4, %edx
+; AVX512BW-NEXT:    orl %ecx, %edx
+; AVX512BW-NEXT:    shll $5, %eax
+; AVX512BW-NEXT:    orl %edx, %eax
+; AVX512BW-NEXT:    kmovd %k2, %ecx
+; AVX512BW-NEXT:    andl $1, %ecx
+; AVX512BW-NEXT:    movl %ecx, %edx
+; AVX512BW-NEXT:    shll $6, %edx
+; AVX512BW-NEXT:    movl %ecx, %edi
+; AVX512BW-NEXT:    shll $7, %edi
+; AVX512BW-NEXT:    orl %edx, %edi
+; AVX512BW-NEXT:    movl %ecx, %edx
+; AVX512BW-NEXT:    shll $8, %edx
+; AVX512BW-NEXT:    orl %edi, %edx
+; AVX512BW-NEXT:    movl %ecx, %edi
+; AVX512BW-NEXT:    shll $9, %edi
+; AVX512BW-NEXT:    orl %edx, %edi
+; AVX512BW-NEXT:    movl %ecx, %edx
+; AVX512BW-NEXT:    shll $10, %edx
+; AVX512BW-NEXT:    orl %edi, %edx
+; AVX512BW-NEXT:    shll $11, %ecx
+; AVX512BW-NEXT:    orl %edx, %ecx
+; AVX512BW-NEXT:    kmovd %k1, %edx
+; AVX512BW-NEXT:    andl $1, %edx
+; AVX512BW-NEXT:    movl %edx, %edi
+; AVX512BW-NEXT:    shll $12, %edi
+; AVX512BW-NEXT:    orl %ecx, %edi
+; AVX512BW-NEXT:    movl %edx, %ecx
+; AVX512BW-NEXT:    shll $13, %ecx
+; AVX512BW-NEXT:    orl %edi, %ecx
+; AVX512BW-NEXT:    movl %edx, %edi
+; AVX512BW-NEXT:    shll $14, %edi
+; AVX512BW-NEXT:    orl %ecx, %edi
+; AVX512BW-NEXT:    movl %edx, %ecx
+; AVX512BW-NEXT:    shll $15, %ecx
+; AVX512BW-NEXT:    orl %edi, %ecx
+; AVX512BW-NEXT:    orl %eax, %ecx
+; AVX512BW-NEXT:    movw %cx, (%rsi)
+; AVX512BW-NEXT:    movl %edx, %eax
+; AVX512BW-NEXT:    shll $16, %eax
+; AVX512BW-NEXT:    shll $17, %edx
+; AVX512BW-NEXT:    orl %eax, %edx
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    movl %eax, %edi
+; AVX512BW-NEXT:    andl $1, %edi
+; AVX512BW-NEXT:    movl %edi, %r8d
+; AVX512BW-NEXT:    shll $18, %r8d
+; AVX512BW-NEXT:    orl %edx, %r8d
+; AVX512BW-NEXT:    movl %edi, %edx
+; AVX512BW-NEXT:    shll $19, %edx
+; AVX512BW-NEXT:    orl %r8d, %edx
+; AVX512BW-NEXT:    movl %edi, %r8d
+; AVX512BW-NEXT:    shll $20, %r8d
+; AVX512BW-NEXT:    orl %edx, %r8d
+; AVX512BW-NEXT:    movl %edi, %edx
+; AVX512BW-NEXT:    shll $21, %edx
+; AVX512BW-NEXT:    orl %r8d, %edx
+; AVX512BW-NEXT:    shll $22, %edi
+; AVX512BW-NEXT:    orl %edx, %edi
+; AVX512BW-NEXT:    shll $23, %eax
+; AVX512BW-NEXT:    orl %edi, %eax
+; AVX512BW-NEXT:    orl %ecx, %eax
+; AVX512BW-NEXT:    shrl $16, %eax
+; AVX512BW-NEXT:    movb %al, 2(%rsi)
+; AVX512BW-NEXT:    retq
+  ; Load the 4-element i1 source mask.
+  %src.vec = load <4 x i1>, ptr %in.vec, align 64
+  ; Replicate: result lanes 6*i .. 6*i+5 all take source element i (i = 0..3).
+  %tgt.mask = shufflevector <4 x i1> %src.vec, <4 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  ; Store the 24-element replicated mask.
+  store <24 x i1> %tgt.mask, ptr %out.vec, align 64
+  ret void
+}
+
+define void @mask_replication_factor6_vf8(ptr %in.vec, ptr %out.vec) nounwind {
+; AVX512F-ONLY-LABEL: mask_replication_factor6_vf8:
+; AVX512F-ONLY:       # %bb.0:
+; AVX512F-ONLY-NEXT:    pushq %rbx
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
+; AVX512F-ONLY-NEXT:    kshiftrw $7, %k1, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, %eax
+; AVX512F-ONLY-NEXT:    kshiftrw $6, %k1, %k0
+; AVX512F-ONLY-NEXT:    kshiftrw $5, %k1, %k2
+; AVX512F-ONLY-NEXT:    kmovw %k2, %ecx
+; AVX512F-ONLY-NEXT:    kshiftrw $4, %k1, %k2
+; AVX512F-ONLY-NEXT:    kmovw %k2, %edi
+; AVX512F-ONLY-NEXT:    kshiftrw $3, %k1, %k2
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r8d
+; AVX512F-ONLY-NEXT:    kshiftrw $2, %k1, %k2
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r9d
+; AVX512F-ONLY-NEXT:    kshiftrw $1, %k1, %k2
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r10d
+; AVX512F-ONLY-NEXT:    kmovw %k1, %edx
+; AVX512F-ONLY-NEXT:    movzbl %dl, %edx
+; AVX512F-ONLY-NEXT:    andl $1, %edx
+; AVX512F-ONLY-NEXT:    leaq (%rdx,%rdx,2), %r11
+; AVX512F-ONLY-NEXT:    leaq (%r11,%rdx,4), %r11
+; AVX512F-ONLY-NEXT:    leaq (%r11,%rdx,8), %r11
+; AVX512F-ONLY-NEXT:    movq %rdx, %rbx
+; AVX512F-ONLY-NEXT:    shlq $4, %rbx
+; AVX512F-ONLY-NEXT:    orq %r11, %rbx
+; AVX512F-ONLY-NEXT:    shlq $5, %rdx
+; AVX512F-ONLY-NEXT:    orq %rbx, %rdx
+; AVX512F-ONLY-NEXT:    movzbl %r10b, %r10d
+; AVX512F-ONLY-NEXT:    andl $1, %r10d
+; AVX512F-ONLY-NEXT:    movq %r10, %r11
+; AVX512F-ONLY-NEXT:    shlq $6, %r11
+; AVX512F-ONLY-NEXT:    movq %r10, %rbx
+; AVX512F-ONLY-NEXT:    shlq $7, %rbx
+; AVX512F-ONLY-NEXT:    orq %r11, %rbx
+; AVX512F-ONLY-NEXT:    movq %r10, %r11
+; AVX512F-ONLY-NEXT:    shlq $8, %r11
+; AVX512F-ONLY-NEXT:    orq %rbx, %r11
+; AVX512F-ONLY-NEXT:    movq %r10, %rbx
+; AVX512F-ONLY-NEXT:    shlq $9, %rbx
+; AVX512F-ONLY-NEXT:    orq %r11, %rbx
+; AVX512F-ONLY-NEXT:    movq %r10, %r11
+; AVX512F-ONLY-NEXT:    shlq $10, %r11
+; AVX512F-ONLY-NEXT:    orq %rbx, %r11
+; AVX512F-ONLY-NEXT:    shlq $11, %r10
+; AVX512F-ONLY-NEXT:    orq %r11, %r10
+; AVX512F-ONLY-NEXT:    movzbl %r9b, %r9d
+; AVX512F-ONLY-NEXT:    andl $1, %r9d
+; AVX512F-ONLY-NEXT:    movq %r9, %r11
+; AVX512F-ONLY-NEXT:    shlq $12, %r11
+; AVX512F-ONLY-NEXT:    orq %r10, %r11
+; AVX512F-ONLY-NEXT:    movq %r9, %r10
+; AVX512F-ONLY-NEXT:    shlq $13, %r10
+; AVX512F-ONLY-NEXT:    orq %r11, %r10
+; AVX512F-ONLY-NEXT:    movq %r9, %r11
+; AVX512F-ONLY-NEXT:    shlq $14, %r11
+; AVX512F-ONLY-NEXT:    orq %r10, %r11
+; AVX512F-ONLY-NEXT:    movq %r9, %r10
+; AVX512F-ONLY-NEXT:    shlq $15, %r10
+; AVX512F-ONLY-NEXT:    orq %r11, %r10
+; AVX512F-ONLY-NEXT:    movq %r9, %r11
+; AVX512F-ONLY-NEXT:    shlq $16, %r11
+; AVX512F-ONLY-NEXT:    orq %r10, %r11
+; AVX512F-ONLY-NEXT:    shlq $17, %r9
+; AVX512F-ONLY-NEXT:    orq %r11, %r9
+; AVX512F-ONLY-NEXT:    movzbl %r8b, %r8d
+; AVX512F-ONLY-NEXT:    andl $1, %r8d
+; AVX512F-ONLY-NEXT:    movq %r8, %r10
+; AVX512F-ONLY-NEXT:    shlq $18, %r10
+; AVX512F-ONLY-NEXT:    orq %r9, %r10
+; AVX512F-ONLY-NEXT:    movq %r8, %r9
+; AVX512F-ONLY-NEXT:    shlq $19, %r9
+; AVX512F-ONLY-NEXT:    orq %r10, %r9
+; AVX512F-ONLY-NEXT:    movq %r8, %r10
+; AVX512F-ONLY-NEXT:    shlq $20, %r10
+; AVX512F-ONLY-NEXT:    orq %r9, %r10
+; AVX512F-ONLY-NEXT:    movq %r8, %r9
+; AVX512F-ONLY-NEXT:    shlq $21, %r9
+; AVX512F-ONLY-NEXT:    orq %r10, %r9
+; AVX512F-ONLY-NEXT:    movq %r8, %r10
+; AVX512F-ONLY-NEXT:    shlq $22, %r10
+; AVX512F-ONLY-NEXT:    orq %r9, %r10
+; AVX512F-ONLY-NEXT:    shlq $23, %r8
+; AVX512F-ONLY-NEXT:    orq %r10, %r8
+; AVX512F-ONLY-NEXT:    movzbl %dil, %r9d
+; AVX512F-ONLY-NEXT:    andl $1, %r9d
+; AVX512F-ONLY-NEXT:    movq %r9, %rdi
+; AVX512F-ONLY-NEXT:    shlq $24, %rdi
+; AVX512F-ONLY-NEXT:    orq %r8, %rdi
+; AVX512F-ONLY-NEXT:    movq %r9, %r8
+; AVX512F-ONLY-NEXT:    shlq $25, %r8
+; AVX512F-ONLY-NEXT:    orq %rdi, %r8
+; AVX512F-ONLY-NEXT:    movq %r9, %rdi
+; AVX512F-ONLY-NEXT:    shlq $26, %rdi
+; AVX512F-ONLY-NEXT:    orq %r8, %rdi
+; AVX512F-ONLY-NEXT:    movq %r9, %r8
+; AVX512F-ONLY-NEXT:    shlq $27, %r8
+; AVX512F-ONLY-NEXT:    orq %rdi, %r8
+; AVX512F-ONLY-NEXT:    movq %r9, %rdi
+; AVX512F-ONLY-NEXT:    shlq $28, %rdi
+; AVX512F-ONLY-NEXT:    orq %r8, %rdi
+; AVX512F-ONLY-NEXT:    shlq $29, %r9
+; AVX512F-ONLY-NEXT:    orq %rdi, %r9
+; AVX512F-ONLY-NEXT:    movzbl %cl, %edi
+; AVX512F-ONLY-NEXT:    andl $1, %edi
+; AVX512F-ONLY-NEXT:    movq %rdi, %r8
+; AVX512F-ONLY-NEXT:    shlq $30, %r8
+; AVX512F-ONLY-NEXT:    orq %r9, %r8
+; AVX512F-ONLY-NEXT:    shlq $31, %rdi
+; AVX512F-ONLY-NEXT:    orq %r8, %rdi
+; AVX512F-ONLY-NEXT:    orq %rdx, %rdi
+; AVX512F-ONLY-NEXT:    movl %edi, (%rsi)
+; AVX512F-ONLY-NEXT:    andl $1, %ecx
+; AVX512F-ONLY-NEXT:    movq %rcx, %rdx
+; AVX512F-ONLY-NEXT:    shlq $32, %rdx
+; AVX512F-ONLY-NEXT:    movq %rcx, %r8
+; AVX512F-ONLY-NEXT:    shlq $33, %r8
+; AVX512F-ONLY-NEXT:    orq %rdx, %r8
+; AVX512F-ONLY-NEXT:    movq %rcx, %rdx
+; AVX512F-ONLY-NEXT:    shlq $34, %rdx
+; AVX512F-ONLY-NEXT:    orq %r8, %rdx
+; AVX512F-ONLY-NEXT:    shlq $35, %rcx
+; AVX512F-ONLY-NEXT:    orq %rdx, %rcx
+; AVX512F-ONLY-NEXT:    kmovw %k0, %edx
+; AVX512F-ONLY-NEXT:    andl $1, %edx
+; AVX512F-ONLY-NEXT:    movq %rdx, %r8
+; AVX512F-ONLY-NEXT:    shlq $36, %r8
+; AVX512F-ONLY-NEXT:    orq %rcx, %r8
+; AVX512F-ONLY-NEXT:    movq %rdx, %rcx
+; AVX512F-ONLY-NEXT:    shlq $37, %rcx
+; AVX512F-ONLY-NEXT:    orq %r8, %rcx
+; AVX512F-ONLY-NEXT:    movq %rdx, %r8
+; AVX512F-ONLY-NEXT:    shlq $38, %r8
+; AVX512F-ONLY-NEXT:    orq %rcx, %r8
+; AVX512F-ONLY-NEXT:    movq %rdx, %rcx
+; AVX512F-ONLY-NEXT:    shlq $39, %rcx
+; AVX512F-ONLY-NEXT:    orq %r8, %rcx
+; AVX512F-ONLY-NEXT:    movq %rdx, %r8
+; AVX512F-ONLY-NEXT:    shlq $40, %r8
+; AVX512F-ONLY-NEXT:    orq %rcx, %r8
+; AVX512F-ONLY-NEXT:    shlq $41, %rdx
+; AVX512F-ONLY-NEXT:    orq %r8, %rdx
+; AVX512F-ONLY-NEXT:    movzbl %al, %ecx
+; AVX512F-ONLY-NEXT:    # kill: def $eax killed $eax def $rax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %r8
+; AVX512F-ONLY-NEXT:    shlq $42, %r8
+; AVX512F-ONLY-NEXT:    orq %rdx, %r8
+; AVX512F-ONLY-NEXT:    movq %rax, %rdx
+; AVX512F-ONLY-NEXT:    shlq $43, %rdx
+; AVX512F-ONLY-NEXT:    orq %r8, %rdx
+; AVX512F-ONLY-NEXT:    movq %rax, %r8
+; AVX512F-ONLY-NEXT:    shlq $44, %r8
+; AVX512F-ONLY-NEXT:    orq %rdx, %r8
+; AVX512F-ONLY-NEXT:    movq %rax, %rdx
+; AVX512F-ONLY-NEXT:    shlq $45, %rdx
+; AVX512F-ONLY-NEXT:    orq %r8, %rdx
+; AVX512F-ONLY-NEXT:    shlq $46, %rax
+; AVX512F-ONLY-NEXT:    orq %rdx, %rax
+; AVX512F-ONLY-NEXT:    shlq $47, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    orq %rdi, %rcx
+; AVX512F-ONLY-NEXT:    shrq $32, %rcx
+; AVX512F-ONLY-NEXT:    movw %cx, 4(%rsi)
+; AVX512F-ONLY-NEXT:    popq %rbx
+; AVX512F-ONLY-NEXT:    retq
+;
+; AVX512DQ-LABEL: mask_replication_factor6_vf8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    pushq %rbx
+; AVX512DQ-NEXT:    kmovb (%rdi), %k1
+; AVX512DQ-NEXT:    kshiftrb $7, %k1, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    kshiftrb $6, %k1, %k0
+; AVX512DQ-NEXT:    kshiftrb $5, %k1, %k2
+; AVX512DQ-NEXT:    kmovw %k2, %ecx
+; AVX512DQ-NEXT:    kshiftrb $4, %k1, %k2
+; AVX512DQ-NEXT:    kmovw %k2, %edi
+; AVX512DQ-NEXT:    kshiftrb $3, %k1, %k2
+; AVX512DQ-NEXT:    kmovw %k2, %r8d
+; AVX512DQ-NEXT:    kshiftrb $2, %k1, %k2
+; AVX512DQ-NEXT:    kmovw %k2, %r9d
+; AVX512DQ-NEXT:    kshiftrb $1, %k1, %k2
+; AVX512DQ-NEXT:    kmovw %k2, %r10d
+; AVX512DQ-NEXT:    kmovw %k1, %edx
+; AVX512DQ-NEXT:    movzbl %dl, %edx
+; AVX512DQ-NEXT:    andl $1, %edx
+; AVX512DQ-NEXT:    leaq (%rdx,%rdx,2), %r11
+; AVX512DQ-NEXT:    leaq (%r11,%rdx,4), %r11
+; AVX512DQ-NEXT:    leaq (%r11,%rdx,8), %r11
+; AVX512DQ-NEXT:    movq %rdx, %rbx
+; AVX512DQ-NEXT:    shlq $4, %rbx
+; AVX512DQ-NEXT:    orq %r11, %rbx
+; AVX512DQ-NEXT:    shlq $5, %rdx
+; AVX512DQ-NEXT:    orq %rbx, %rdx
+; AVX512DQ-NEXT:    movzbl %r10b, %r10d
+; AVX512DQ-NEXT:    andl $1, %r10d
+; AVX512DQ-NEXT:    movq %r10, %r11
+; AVX512DQ-NEXT:    shlq $6, %r11
+; AVX512DQ-NEXT:    movq %r10, %rbx
+; AVX512DQ-NEXT:    shlq $7, %rbx
+; AVX512DQ-NEXT:    orq %r11, %rbx
+; AVX512DQ-NEXT:    movq %r10, %r11
+; AVX512DQ-NEXT:    shlq $8, %r11
+; AVX512DQ-NEXT:    orq %rbx, %r11
+; AVX512DQ-NEXT:    movq %r10, %rbx
+; AVX512DQ-NEXT:    shlq $9, %rbx
+; AVX512DQ-NEXT:    orq %r11, %rbx
+; AVX512DQ-NEXT:    movq %r10, %r11
+; AVX512DQ-NEXT:    shlq $10, %r11
+; AVX512DQ-NEXT:    orq %rbx, %r11
+; AVX512DQ-NEXT:    shlq $11, %r10
+; AVX512DQ-NEXT:    orq %r11, %r10
+; AVX512DQ-NEXT:    movzbl %r9b, %r9d
+; AVX512DQ-NEXT:    andl $1, %r9d
+; AVX512DQ-NEXT:    movq %r9, %r11
+; AVX512DQ-NEXT:    shlq $12, %r11
+; AVX512DQ-NEXT:    orq %r10, %r11
+; AVX512DQ-NEXT:    movq %r9, %r10
+; AVX512DQ-NEXT:    shlq $13, %r10
+; AVX512DQ-NEXT:    orq %r11, %r10
+; AVX512DQ-NEXT:    movq %r9, %r11
+; AVX512DQ-NEXT:    shlq $14, %r11
+; AVX512DQ-NEXT:    orq %r10, %r11
+; AVX512DQ-NEXT:    movq %r9, %r10
+; AVX512DQ-NEXT:    shlq $15, %r10
+; AVX512DQ-NEXT:    orq %r11, %r10
+; AVX512DQ-NEXT:    movq %r9, %r11
+; AVX512DQ-NEXT:    shlq $16, %r11
+; AVX512DQ-NEXT:    orq %r10, %r11
+; AVX512DQ-NEXT:    shlq $17, %r9
+; AVX512DQ-NEXT:    orq %r11, %r9
+; AVX512DQ-NEXT:    movzbl %r8b, %r8d
+; AVX512DQ-NEXT:    andl $1, %r8d
+; AVX512DQ-NEXT:    movq %r8, %r10
+; AVX512DQ-NEXT:    shlq $18, %r10
+; AVX512DQ-NEXT:    orq %r9, %r10
+; AVX512DQ-NEXT:    movq %r8, %r9
+; AVX512DQ-NEXT:    shlq $19, %r9
+; AVX512DQ-NEXT:    orq %r10, %r9
+; AVX512DQ-NEXT:    movq %r8, %r10
+; AVX512DQ-NEXT:    shlq $20, %r10
+; AVX512DQ-NEXT:    orq %r9, %r10
+; AVX512DQ-NEXT:    movq %r8, %r9
+; AVX512DQ-NEXT:    shlq $21, %r9
+; AVX512DQ-NEXT:    orq %r10, %r9
+; AVX512DQ-NEXT:    movq %r8, %r10
+; AVX512DQ-NEXT:    shlq $22, %r10
+; AVX512DQ-NEXT:    orq %r9, %r10
+; AVX512DQ-NEXT:    shlq $23, %r8
+; AVX512DQ-NEXT:    orq %r10, %r8
+; AVX512DQ-NEXT:    movzbl %dil, %r9d
+; AVX512DQ-NEXT:    andl $1, %r9d
+; AVX512DQ-NEXT:    movq %r9, %rdi
+; AVX512DQ-NEXT:    shlq $24, %rdi
+; AVX512DQ-NEXT:    orq %r8, %rdi
+; AVX512DQ-NEXT:    movq %r9, %r8
+; AVX512DQ-NEXT:    shlq $25, %r8
+; AVX512DQ-NEXT:    orq %rdi, %r8
+; AVX512DQ-NEXT:    movq %r9, %rdi
+; AVX512DQ-NEXT:    shlq $26, %rdi
+; AVX512DQ-NEXT:    orq %r8, %rdi
+; AVX512DQ-NEXT:    movq %r9, %r8
+; AVX512DQ-NEXT:    shlq $27, %r8
+; AVX512DQ-NEXT:    orq %rdi, %r8
+; AVX512DQ-NEXT:    movq %r9, %rdi
+; AVX512DQ-NEXT:    shlq $28, %rdi
+; AVX512DQ-NEXT:    orq %r8, %rdi
+; AVX512DQ-NEXT:    shlq $29, %r9
+; AVX512DQ-NEXT:    orq %rdi, %r9
+; AVX512DQ-NEXT:    movzbl %cl, %edi
+; AVX512DQ-NEXT:    andl $1, %edi
+; AVX512DQ-NEXT:    movq %rdi, %r8
+; AVX512DQ-NEXT:    shlq $30, %r8
+; AVX512DQ-NEXT:    orq %r9, %r8
+; AVX512DQ-NEXT:    shlq $31, %rdi
+; AVX512DQ-NEXT:    orq %r8, %rdi
+; AVX512DQ-NEXT:    orq %rdx, %rdi
+; AVX512DQ-NEXT:    movl %edi, (%rsi)
+; AVX512DQ-NEXT:    andl $1, %ecx
+; AVX512DQ-NEXT:    movq %rcx, %rdx
+; AVX512DQ-NEXT:    shlq $32, %rdx
+; AVX512DQ-NEXT:    movq %rcx, %r8
+; AVX512DQ-NEXT:    shlq $33, %r8
+; AVX512DQ-NEXT:    orq %rdx, %r8
+; AVX512DQ-NEXT:    movq %rcx, %rdx
+; AVX512DQ-NEXT:    shlq $34, %rdx
+; AVX512DQ-NEXT:    orq %r8, %rdx
+; AVX512DQ-NEXT:    shlq $35, %rcx
+; AVX512DQ-NEXT:    orq %rdx, %rcx
+; AVX512DQ-NEXT:    kmovw %k0, %edx
+; AVX512DQ-NEXT:    andl $1, %edx
+; AVX512DQ-NEXT:    movq %rdx, %r8
+; AVX512DQ-NEXT:    shlq $36, %r8
+; AVX512DQ-NEXT:    orq %rcx, %r8
+; AVX512DQ-NEXT:    movq %rdx, %rcx
+; AVX512DQ-NEXT:    shlq $37, %rcx
+; AVX512DQ-NEXT:    orq %r8, %rcx
+; AVX512DQ-NEXT:    movq %rdx, %r8
+; AVX512DQ-NEXT:    shlq $38, %r8
+; AVX512DQ-NEXT:    orq %rcx, %r8
+; AVX512DQ-NEXT:    movq %rdx, %rcx
+; AVX512DQ-NEXT:    shlq $39, %rcx
+; AVX512DQ-NEXT:    orq %r8, %rcx
+; AVX512DQ-NEXT:    movq %rdx, %r8
+; AVX512DQ-NEXT:    shlq $40, %r8
+; AVX512DQ-NEXT:    orq %rcx, %r8
+; AVX512DQ-NEXT:    shlq $41, %rdx
+; AVX512DQ-NEXT:    orq %r8, %rdx
+; AVX512DQ-NEXT:    movzbl %al, %ecx
+; AVX512DQ-NEXT:    # kill: def $eax killed $eax def $rax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %r8
+; AVX512DQ-NEXT:    shlq $42, %r8
+; AVX512DQ-NEXT:    orq %rdx, %r8
+; AVX512DQ-NEXT:    movq %rax, %rdx
+; AVX512DQ-NEXT:    shlq $43, %rdx
+; AVX512DQ-NEXT:    orq %r8, %rdx
+; AVX512DQ-NEXT:    movq %rax, %r8
+; AVX512DQ-NEXT:    shlq $44, %r8
+; AVX512DQ-NEXT:    orq %rdx, %r8
+; AVX512DQ-NEXT:    movq %rax, %rdx
+; AVX512DQ-NEXT:    shlq $45, %rdx
+; AVX512DQ-NEXT:    orq %r8, %rdx
+; AVX512DQ-NEXT:    shlq $46, %rax
+; AVX512DQ-NEXT:    orq %rdx, %rax
+; AVX512DQ-NEXT:    shlq $47, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    orq %rdi, %rcx
+; AVX512DQ-NEXT:    shrq $32, %rcx
+; AVX512DQ-NEXT:    movw %cx, 4(%rsi)
+; AVX512DQ-NEXT:    popq %rbx
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: mask_replication_factor6_vf8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    pushq %rbx
+; AVX512BW-NEXT:    kmovw (%rdi), %k1
+; AVX512BW-NEXT:    kshiftrw $7, %k1, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    kshiftrw $6, %k1, %k0
+; AVX512BW-NEXT:    kshiftrw $5, %k1, %k2
+; AVX512BW-NEXT:    kmovd %k2, %ecx
+; AVX512BW-NEXT:    kshiftrw $4, %k1, %k2
+; AVX512BW-NEXT:    kmovd %k2, %edi
+; AVX512BW-NEXT:    kshiftrw $3, %k1, %k2
+; AVX512BW-NEXT:    kmovd %k2, %r8d
+; AVX512BW-NEXT:    kshiftrw $2, %k1, %k2
+; AVX512BW-NEXT:    kmovd %k2, %r9d
+; AVX512BW-NEXT:    kshiftrw $1, %k1, %k2
+; AVX512BW-NEXT:    kmovd %k2, %r10d
+; AVX512BW-NEXT:    kmovd %k1, %edx
+; AVX512BW-NEXT:    movzbl %dl, %edx
+; AVX512BW-NEXT:    andl $1, %edx
+; AVX512BW-NEXT:    leaq (%rdx,%rdx,2), %r11
+; AVX512BW-NEXT:    leaq (%r11,%rdx,4), %r11
+; AVX512BW-NEXT:    leaq (%r11,%rdx,8), %r11
+; AVX512BW-NEXT:    movq %rdx, %rbx
+; AVX512BW-NEXT:    shlq $4, %rbx
+; AVX512BW-NEXT:    orq %r11, %rbx
+; AVX512BW-NEXT:    shlq $5, %rdx
+; AVX512BW-NEXT:    orq %rbx, %rdx
+; AVX512BW-NEXT:    movzbl %r10b, %r10d
+; AVX512BW-NEXT:    andl $1, %r10d
+; AVX512BW-NEXT:    movq %r10, %r11
+; AVX512BW-NEXT:    shlq $6, %r11
+; AVX512BW-NEXT:    movq %r10, %rbx
+; AVX512BW-NEXT:    shlq $7, %rbx
+; AVX512BW-NEXT:    orq %r11, %rbx
+; AVX512BW-NEXT:    movq %r10, %r11
+; AVX512BW-NEXT:    shlq $8, %r11
+; AVX512BW-NEXT:    orq %rbx, %r11
+; AVX512BW-NEXT:    movq %r10, %rbx
+; AVX512BW-NEXT:    shlq $9, %rbx
+; AVX512BW-NEXT:    orq %r11, %rbx
+; AVX512BW-NEXT:    movq %r10, %r11
+; AVX512BW-NEXT:    shlq $10, %r11
+; AVX512BW-NEXT:    orq %rbx, %r11
+; AVX512BW-NEXT:    shlq $11, %r10
+; AVX512BW-NEXT:    orq %r11, %r10
+; AVX512BW-NEXT:    movzbl %r9b, %r9d
+; AVX512BW-NEXT:    andl $1, %r9d
+; AVX512BW-NEXT:    movq %r9, %r11
+; AVX512BW-NEXT:    shlq $12, %r11
+; AVX512BW-NEXT:    orq %r10, %r11
+; AVX512BW-NEXT:    movq %r9, %r10
+; AVX512BW-NEXT:    shlq $13, %r10
+; AVX512BW-NEXT:    orq %r11, %r10
+; AVX512BW-NEXT:    movq %r9, %r11
+; AVX512BW-NEXT:    shlq $14, %r11
+; AVX512BW-NEXT:    orq %r10, %r11
+; AVX512BW-NEXT:    movq %r9, %r10
+; AVX512BW-NEXT:    shlq $15, %r10
+; AVX512BW-NEXT:    orq %r11, %r10
+; AVX512BW-NEXT:    movq %r9, %r11
+; AVX512BW-NEXT:    shlq $16, %r11
+; AVX512BW-NEXT:    orq %r10, %r11
+; AVX512BW-NEXT:    shlq $17, %r9
+; AVX512BW-NEXT:    orq %r11, %r9
+; AVX512BW-NEXT:    movzbl %r8b, %r8d
+; AVX512BW-NEXT:    andl $1, %r8d
+; AVX512BW-NEXT:    movq %r8, %r10
+; AVX512BW-NEXT:    shlq $18, %r10
+; AVX512BW-NEXT:    orq %r9, %r10
+; AVX512BW-NEXT:    movq %r8, %r9
+; AVX512BW-NEXT:    shlq $19, %r9
+; AVX512BW-NEXT:    orq %r10, %r9
+; AVX512BW-NEXT:    movq %r8, %r10
+; AVX512BW-NEXT:    shlq $20, %r10
+; AVX512BW-NEXT:    orq %r9, %r10
+; AVX512BW-NEXT:    movq %r8, %r9
+; AVX512BW-NEXT:    shlq $21, %r9
+; AVX512BW-NEXT:    orq %r10, %r9
+; AVX512BW-NEXT:    movq %r8, %r10
+; AVX512BW-NEXT:    shlq $22, %r10
+; AVX512BW-NEXT:    orq %r9, %r10
+; AVX512BW-NEXT:    shlq $23, %r8
+; AVX512BW-NEXT:    orq %r10, %r8
+; AVX512BW-NEXT:    movzbl %dil, %r9d
+; AVX512BW-NEXT:    andl $1, %r9d
+; AVX512BW-NEXT:    movq %r9, %rdi
+; AVX512BW-NEXT:    shlq $24, %rdi
+; AVX512BW-NEXT:    orq %r8, %rdi
+; AVX512BW-NEXT:    movq %r9, %r8
+; AVX512BW-NEXT:    shlq $25, %r8
+; AVX512BW-NEXT:    orq %rdi, %r8
+; AVX512BW-NEXT:    movq %r9, %rdi
+; AVX512BW-NEXT:    shlq $26, %rdi
+; AVX512BW-NEXT:    orq %r8, %rdi
+; AVX512BW-NEXT:    movq %r9, %r8
+; AVX512BW-NEXT:    shlq $27, %r8
+; AVX512BW-NEXT:    orq %rdi, %r8
+; AVX512BW-NEXT:    movq %r9, %rdi
+; AVX512BW-NEXT:    shlq $28, %rdi
+; AVX512BW-NEXT:    orq %r8, %rdi
+; AVX512BW-NEXT:    shlq $29, %r9
+; AVX512BW-NEXT:    orq %rdi, %r9
+; AVX512BW-NEXT:    movzbl %cl, %edi
+; AVX512BW-NEXT:    andl $1, %edi
+; AVX512BW-NEXT:    movq %rdi, %r8
+; AVX512BW-NEXT:    shlq $30, %r8
+; AVX512BW-NEXT:    orq %r9, %r8
+; AVX512BW-NEXT:    shlq $31, %rdi
+; AVX512BW-NEXT:    orq %r8, %rdi
+; AVX512BW-NEXT:    orq %rdx, %rdi
+; AVX512BW-NEXT:    movl %edi, (%rsi)
+; AVX512BW-NEXT:    andl $1, %ecx
+; AVX512BW-NEXT:    movq %rcx, %rdx
+; AVX512BW-NEXT:    shlq $32, %rdx
+; AVX512BW-NEXT:    movq %rcx, %r8
+; AVX512BW-NEXT:    shlq $33, %r8
+; AVX512BW-NEXT:    orq %rdx, %r8
+; AVX512BW-NEXT:    movq %rcx, %rdx
+; AVX512BW-NEXT:    shlq $34, %rdx
+; AVX512BW-NEXT:    orq %r8, %rdx
+; AVX512BW-NEXT:    shlq $35, %rcx
+; AVX512BW-NEXT:    orq %rdx, %rcx
+; AVX512BW-NEXT:    kmovd %k0, %edx
+; AVX512BW-NEXT:    andl $1, %edx
+; AVX512BW-NEXT:    movq %rdx, %r8
+; AVX512BW-NEXT:    shlq $36, %r8
+; AVX512BW-NEXT:    orq %rcx, %r8
+; AVX512BW-NEXT:    movq %rdx, %rcx
+; AVX512BW-NEXT:    shlq $37, %rcx
+; AVX512BW-NEXT:    orq %r8, %rcx
+; AVX512BW-NEXT:    movq %rdx, %r8
+; AVX512BW-NEXT:    shlq $38, %r8
+; AVX512BW-NEXT:    orq %rcx, %r8
+; AVX512BW-NEXT:    movq %rdx, %rcx
+; AVX512BW-NEXT:    shlq $39, %rcx
+; AVX512BW-NEXT:    orq %r8, %rcx
+; AVX512BW-NEXT:    movq %rdx, %r8
+; AVX512BW-NEXT:    shlq $40, %r8
+; AVX512BW-NEXT:    orq %rcx, %r8
+; AVX512BW-NEXT:    shlq $41, %rdx
+; AVX512BW-NEXT:    orq %r8, %rdx
+; AVX512BW-NEXT:    movzbl %al, %ecx
+; AVX512BW-NEXT:    # kill: def $eax killed $eax def $rax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %r8
+; AVX512BW-NEXT:    shlq $42, %r8
+; AVX512BW-NEXT:    orq %rdx, %r8
+; AVX512BW-NEXT:    movq %rax, %rdx
+; AVX512BW-NEXT:    shlq $43, %rdx
+; AVX512BW-NEXT:    orq %r8, %rdx
+; AVX512BW-NEXT:    movq %rax, %r8
+; AVX512BW-NEXT:    shlq $44, %r8
+; AVX512BW-NEXT:    orq %rdx, %r8
+; AVX512BW-NEXT:    movq %rax, %rdx
+; AVX512BW-NEXT:    shlq $45, %rdx
+; AVX512BW-NEXT:    orq %r8, %rdx
+; AVX512BW-NEXT:    shlq $46, %rax
+; AVX512BW-NEXT:    orq %rdx, %rax
+; AVX512BW-NEXT:    shlq $47, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    orq %rdi, %rcx
+; AVX512BW-NEXT:    shrq $32, %rcx
+; AVX512BW-NEXT:    movw %cx, 4(%rsi)
+; AVX512BW-NEXT:    popq %rbx
+; AVX512BW-NEXT:    retq
+  %src.vec = load <8 x i1>, ptr %in.vec, align 64
+  %tgt.mask = shufflevector <8 x i1> %src.vec, <8 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+  store <48 x i1> %tgt.mask, ptr %out.vec, align 64
+  ret void
+}
+
+define void @mask_replication_factor6_vf16(ptr %in.vec, ptr %out.vec) nounwind {
+; AVX512F-ONLY-LABEL: mask_replication_factor6_vf16:
+; AVX512F-ONLY:       # %bb.0:
+; AVX512F-ONLY-NEXT:    pushq %rbp
+; AVX512F-ONLY-NEXT:    pushq %r15
+; AVX512F-ONLY-NEXT:    pushq %r14
+; AVX512F-ONLY-NEXT:    pushq %r13
+; AVX512F-ONLY-NEXT:    pushq %r12
+; AVX512F-ONLY-NEXT:    pushq %rbx
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k0
+; AVX512F-ONLY-NEXT:    kshiftrw $10, %k0, %k1
+; AVX512F-ONLY-NEXT:    kmovw %k1, %edi
+; AVX512F-ONLY-NEXT:    kshiftrw $5, %k0, %k1
+; AVX512F-ONLY-NEXT:    kmovw %k1, %ecx
+; AVX512F-ONLY-NEXT:    kshiftrw $4, %k0, %k1
+; AVX512F-ONLY-NEXT:    kmovw %k1, %eax
+; AVX512F-ONLY-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512F-ONLY-NEXT:    kshiftrw $3, %k0, %k1
+; AVX512F-ONLY-NEXT:    kmovw %k1, %edx
+; AVX512F-ONLY-NEXT:    kshiftrw $2, %k0, %k1
+; AVX512F-ONLY-NEXT:    kmovw %k1, %r8d
+; AVX512F-ONLY-NEXT:    kshiftrw $1, %k0, %k1
+; AVX512F-ONLY-NEXT:    kmovw %k1, %r9d
+; AVX512F-ONLY-NEXT:    kshiftrw $11, %k0, %k1
+; AVX512F-ONLY-NEXT:    kmovw %k1, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $12, %k0, %k1
+; AVX512F-ONLY-NEXT:    kmovw %k1, %r14d
+; AVX512F-ONLY-NEXT:    kshiftrw $13, %k0, %k1
+; AVX512F-ONLY-NEXT:    kmovw %k1, %ebp
+; AVX512F-ONLY-NEXT:    kshiftrw $14, %k0, %k1
+; AVX512F-ONLY-NEXT:    kmovw %k1, %r15d
+; AVX512F-ONLY-NEXT:    kshiftrw $15, %k0, %k1
+; AVX512F-ONLY-NEXT:    kmovw %k1, %r11d
+; AVX512F-ONLY-NEXT:    kshiftrw $6, %k0, %k1
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movl %edi, %r10d
+; AVX512F-ONLY-NEXT:    andl $1, %r10d
+; AVX512F-ONLY-NEXT:    leal (%r10,%r10,2), %ebx
+; AVX512F-ONLY-NEXT:    leal (%rbx,%r12,4), %ebx
+; AVX512F-ONLY-NEXT:    leal (%rbx,%r12,8), %ebx
+; AVX512F-ONLY-NEXT:    movl %r12d, %r13d
+; AVX512F-ONLY-NEXT:    shll $4, %r13d
+; AVX512F-ONLY-NEXT:    orl %ebx, %r13d
+; AVX512F-ONLY-NEXT:    movl %r12d, %ebx
+; AVX512F-ONLY-NEXT:    shll $5, %ebx
+; AVX512F-ONLY-NEXT:    orl %r13d, %ebx
+; AVX512F-ONLY-NEXT:    movl %r12d, %r13d
+; AVX512F-ONLY-NEXT:    shll $6, %r13d
+; AVX512F-ONLY-NEXT:    shll $7, %r12d
+; AVX512F-ONLY-NEXT:    orl %r13d, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r14d
+; AVX512F-ONLY-NEXT:    movl %r14d, %r13d
+; AVX512F-ONLY-NEXT:    shll $8, %r13d
+; AVX512F-ONLY-NEXT:    orl %r12d, %r13d
+; AVX512F-ONLY-NEXT:    movl %r14d, %r12d
+; AVX512F-ONLY-NEXT:    shll $9, %r12d
+; AVX512F-ONLY-NEXT:    orl %r13d, %r12d
+; AVX512F-ONLY-NEXT:    movl %r14d, %r13d
+; AVX512F-ONLY-NEXT:    shll $10, %r13d
+; AVX512F-ONLY-NEXT:    orl %r12d, %r13d
+; AVX512F-ONLY-NEXT:    movl %r14d, %r12d
+; AVX512F-ONLY-NEXT:    shll $11, %r12d
+; AVX512F-ONLY-NEXT:    orl %r13d, %r12d
+; AVX512F-ONLY-NEXT:    movl %r14d, %r13d
+; AVX512F-ONLY-NEXT:    shll $12, %r13d
+; AVX512F-ONLY-NEXT:    orl %r12d, %r13d
+; AVX512F-ONLY-NEXT:    kmovw %k1, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $7, %k0, %k1
+; AVX512F-ONLY-NEXT:    shll $13, %r14d
+; AVX512F-ONLY-NEXT:    orl %r13d, %r14d
+; AVX512F-ONLY-NEXT:    andl $1, %ebp
+; AVX512F-ONLY-NEXT:    movl %ebp, %r13d
+; AVX512F-ONLY-NEXT:    shll $14, %r13d
+; AVX512F-ONLY-NEXT:    orl %r14d, %r13d
+; AVX512F-ONLY-NEXT:    movl %ebp, %r14d
+; AVX512F-ONLY-NEXT:    shll $15, %r14d
+; AVX512F-ONLY-NEXT:    orl %r13d, %r14d
+; AVX512F-ONLY-NEXT:    movl %ebp, %r13d
+; AVX512F-ONLY-NEXT:    shll $16, %r13d
+; AVX512F-ONLY-NEXT:    orl %r14d, %r13d
+; AVX512F-ONLY-NEXT:    movl %ebp, %r14d
+; AVX512F-ONLY-NEXT:    shll $17, %r14d
+; AVX512F-ONLY-NEXT:    orl %r13d, %r14d
+; AVX512F-ONLY-NEXT:    movl %ebp, %r13d
+; AVX512F-ONLY-NEXT:    shll $18, %r13d
+; AVX512F-ONLY-NEXT:    orl %r14d, %r13d
+; AVX512F-ONLY-NEXT:    kmovw %k1, %r14d
+; AVX512F-ONLY-NEXT:    kshiftrw $8, %k0, %k1
+; AVX512F-ONLY-NEXT:    shll $19, %ebp
+; AVX512F-ONLY-NEXT:    orl %r13d, %ebp
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    movl %r15d, %r13d
+; AVX512F-ONLY-NEXT:    shll $20, %r13d
+; AVX512F-ONLY-NEXT:    orl %ebp, %r13d
+; AVX512F-ONLY-NEXT:    movl %r15d, %ebp
+; AVX512F-ONLY-NEXT:    shll $21, %ebp
+; AVX512F-ONLY-NEXT:    orl %r13d, %ebp
+; AVX512F-ONLY-NEXT:    movl %r15d, %r13d
+; AVX512F-ONLY-NEXT:    shll $22, %r13d
+; AVX512F-ONLY-NEXT:    orl %ebp, %r13d
+; AVX512F-ONLY-NEXT:    movl %r15d, %ebp
+; AVX512F-ONLY-NEXT:    shll $23, %ebp
+; AVX512F-ONLY-NEXT:    orl %r13d, %ebp
+; AVX512F-ONLY-NEXT:    movl %r15d, %r13d
+; AVX512F-ONLY-NEXT:    shll $24, %r13d
+; AVX512F-ONLY-NEXT:    orl %ebp, %r13d
+; AVX512F-ONLY-NEXT:    shll $25, %r15d
+; AVX512F-ONLY-NEXT:    orl %r13d, %r15d
+; AVX512F-ONLY-NEXT:    movl %r11d, %r13d
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movl %r13d, %ebp
+; AVX512F-ONLY-NEXT:    shll $26, %ebp
+; AVX512F-ONLY-NEXT:    orl %r15d, %ebp
+; AVX512F-ONLY-NEXT:    movl %r13d, %r15d
+; AVX512F-ONLY-NEXT:    shll $27, %r15d
+; AVX512F-ONLY-NEXT:    orl %ebp, %r15d
+; AVX512F-ONLY-NEXT:    movl %r13d, %ebp
+; AVX512F-ONLY-NEXT:    shll $28, %ebp
+; AVX512F-ONLY-NEXT:    orl %r15d, %ebp
+; AVX512F-ONLY-NEXT:    movl %r13d, %eax
+; AVX512F-ONLY-NEXT:    shll $29, %eax
+; AVX512F-ONLY-NEXT:    orl %ebp, %eax
+; AVX512F-ONLY-NEXT:    kmovw %k1, %r15d
+; AVX512F-ONLY-NEXT:    shll $30, %r13d
+; AVX512F-ONLY-NEXT:    orl %eax, %r13d
+; AVX512F-ONLY-NEXT:    kmovw %k0, %ebp
+; AVX512F-ONLY-NEXT:    kshiftrw $9, %k0, %k0
+; AVX512F-ONLY-NEXT:    shll $31, %r11d
+; AVX512F-ONLY-NEXT:    orl %r13d, %r11d
+; AVX512F-ONLY-NEXT:    kmovw %k0, %r13d
+; AVX512F-ONLY-NEXT:    movzbl %dil, %edi
+; AVX512F-ONLY-NEXT:    orl %ebx, %r11d
+; AVX512F-ONLY-NEXT:    movl %r11d, 8(%rsi)
+; AVX512F-ONLY-NEXT:    movzbl %bpl, %r11d
+; AVX512F-ONLY-NEXT:    andl $1, %r11d
+; AVX512F-ONLY-NEXT:    leaq (%r11,%r11,2), %rax
+; AVX512F-ONLY-NEXT:    leaq (%rax,%r11,4), %rax
+; AVX512F-ONLY-NEXT:    leaq (%rax,%r11,8), %rax
+; AVX512F-ONLY-NEXT:    movq %r11, %rbx
+; AVX512F-ONLY-NEXT:    shlq $4, %rbx
+; AVX512F-ONLY-NEXT:    orq %rax, %rbx
+; AVX512F-ONLY-NEXT:    shlq $5, %r11
+; AVX512F-ONLY-NEXT:    orq %rbx, %r11
+; AVX512F-ONLY-NEXT:    movzbl %r9b, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %r9
+; AVX512F-ONLY-NEXT:    shlq $6, %r9
+; AVX512F-ONLY-NEXT:    movq %rax, %rbx
+; AVX512F-ONLY-NEXT:    shlq $7, %rbx
+; AVX512F-ONLY-NEXT:    orq %r9, %rbx
+; AVX512F-ONLY-NEXT:    movq %rax, %r9
+; AVX512F-ONLY-NEXT:    shlq $8, %r9
+; AVX512F-ONLY-NEXT:    orq %rbx, %r9
+; AVX512F-ONLY-NEXT:    movq %rax, %rbx
+; AVX512F-ONLY-NEXT:    shlq $9, %rbx
+; AVX512F-ONLY-NEXT:    orq %r9, %rbx
+; AVX512F-ONLY-NEXT:    movq %rax, %r9
+; AVX512F-ONLY-NEXT:    shlq $10, %r9
+; AVX512F-ONLY-NEXT:    orq %rbx, %r9
+; AVX512F-ONLY-NEXT:    shlq $11, %rax
+; AVX512F-ONLY-NEXT:    orq %r9, %rax
+; AVX512F-ONLY-NEXT:    movzbl %r8b, %r8d
+; AVX512F-ONLY-NEXT:    andl $1, %r8d
+; AVX512F-ONLY-NEXT:    movq %r8, %r9
+; AVX512F-ONLY-NEXT:    shlq $12, %r9
+; AVX512F-ONLY-NEXT:    orq %rax, %r9
+; AVX512F-ONLY-NEXT:    movq %r8, %rax
+; AVX512F-ONLY-NEXT:    shlq $13, %rax
+; AVX512F-ONLY-NEXT:    orq %r9, %rax
+; AVX512F-ONLY-NEXT:    movq %r8, %r9
+; AVX512F-ONLY-NEXT:    shlq $14, %r9
+; AVX512F-ONLY-NEXT:    orq %rax, %r9
+; AVX512F-ONLY-NEXT:    movq %r8, %rax
+; AVX512F-ONLY-NEXT:    shlq $15, %rax
+; AVX512F-ONLY-NEXT:    orq %r9, %rax
+; AVX512F-ONLY-NEXT:    movq %r8, %r9
+; AVX512F-ONLY-NEXT:    shlq $16, %r9
+; AVX512F-ONLY-NEXT:    orq %rax, %r9
+; AVX512F-ONLY-NEXT:    shlq $17, %r8
+; AVX512F-ONLY-NEXT:    orq %r9, %r8
+; AVX512F-ONLY-NEXT:    movzbl %dl, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %rdx
+; AVX512F-ONLY-NEXT:    shlq $18, %rdx
+; AVX512F-ONLY-NEXT:    orq %r8, %rdx
+; AVX512F-ONLY-NEXT:    movq %rax, %r8
+; AVX512F-ONLY-NEXT:    shlq $19, %r8
+; AVX512F-ONLY-NEXT:    orq %rdx, %r8
+; AVX512F-ONLY-NEXT:    movq %rax, %rdx
+; AVX512F-ONLY-NEXT:    shlq $20, %rdx
+; AVX512F-ONLY-NEXT:    orq %r8, %rdx
+; AVX512F-ONLY-NEXT:    movq %rax, %r8
+; AVX512F-ONLY-NEXT:    shlq $21, %r8
+; AVX512F-ONLY-NEXT:    orq %rdx, %r8
+; AVX512F-ONLY-NEXT:    movq %rax, %rdx
+; AVX512F-ONLY-NEXT:    shlq $22, %rdx
+; AVX512F-ONLY-NEXT:    orq %r8, %rdx
+; AVX512F-ONLY-NEXT:    shlq $23, %rax
+; AVX512F-ONLY-NEXT:    orq %rdx, %rax
+; AVX512F-ONLY-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
+; AVX512F-ONLY-NEXT:    andl $1, %edx
+; AVX512F-ONLY-NEXT:    movq %rdx, %r8
+; AVX512F-ONLY-NEXT:    shlq $24, %r8
+; AVX512F-ONLY-NEXT:    orq %rax, %r8
+; AVX512F-ONLY-NEXT:    movq %rdx, %rax
+; AVX512F-ONLY-NEXT:    shlq $25, %rax
+; AVX512F-ONLY-NEXT:    orq %r8, %rax
+; AVX512F-ONLY-NEXT:    movq %rdx, %r8
+; AVX512F-ONLY-NEXT:    shlq $26, %r8
+; AVX512F-ONLY-NEXT:    orq %rax, %r8
+; AVX512F-ONLY-NEXT:    movq %rdx, %rax
+; AVX512F-ONLY-NEXT:    shlq $27, %rax
+; AVX512F-ONLY-NEXT:    orq %r8, %rax
+; AVX512F-ONLY-NEXT:    movq %rdx, %r8
+; AVX512F-ONLY-NEXT:    shlq $28, %r8
+; AVX512F-ONLY-NEXT:    orq %rax, %r8
+; AVX512F-ONLY-NEXT:    shlq $29, %rdx
+; AVX512F-ONLY-NEXT:    orq %r8, %rdx
+; AVX512F-ONLY-NEXT:    movzbl %cl, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %r8
+; AVX512F-ONLY-NEXT:    shlq $30, %r8
+; AVX512F-ONLY-NEXT:    orq %rdx, %r8
+; AVX512F-ONLY-NEXT:    shlq $31, %rax
+; AVX512F-ONLY-NEXT:    orq %r8, %rax
+; AVX512F-ONLY-NEXT:    andl $1, %ecx
+; AVX512F-ONLY-NEXT:    movq %rcx, %rdx
+; AVX512F-ONLY-NEXT:    shlq $32, %rdx
+; AVX512F-ONLY-NEXT:    orq %rax, %rdx
+; AVX512F-ONLY-NEXT:    movq %rcx, %rax
+; AVX512F-ONLY-NEXT:    shlq $33, %rax
+; AVX512F-ONLY-NEXT:    orq %rdx, %rax
+; AVX512F-ONLY-NEXT:    movq %rcx, %rdx
+; AVX512F-ONLY-NEXT:    shlq $34, %rdx
+; AVX512F-ONLY-NEXT:    orq %rax, %rdx
+; AVX512F-ONLY-NEXT:    shlq $35, %rcx
+; AVX512F-ONLY-NEXT:    orq %rdx, %rcx
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $36, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %rcx
+; AVX512F-ONLY-NEXT:    shlq $37, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $38, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %rcx
+; AVX512F-ONLY-NEXT:    shlq $39, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $40, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    shlq $41, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    andl $1, %r14d
+; AVX512F-ONLY-NEXT:    movq %r14, %rax
+; AVX512F-ONLY-NEXT:    shlq $42, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r14, %rcx
+; AVX512F-ONLY-NEXT:    shlq $43, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    movq %r14, %rax
+; AVX512F-ONLY-NEXT:    shlq $44, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    movq %r14, %rcx
+; AVX512F-ONLY-NEXT:    shlq $45, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    movq %r14, %rax
+; AVX512F-ONLY-NEXT:    shlq $46, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    shlq $47, %r14
+; AVX512F-ONLY-NEXT:    orq %rax, %r14
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $48, %rax
+; AVX512F-ONLY-NEXT:    orq %r14, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %rcx
+; AVX512F-ONLY-NEXT:    shlq $49, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $50, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %rcx
+; AVX512F-ONLY-NEXT:    shlq $51, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $52, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    shlq $53, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $54, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %rcx
+; AVX512F-ONLY-NEXT:    shlq $55, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $56, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %rcx
+; AVX512F-ONLY-NEXT:    shlq $57, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $58, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    shlq $59, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r10, %rax
+; AVX512F-ONLY-NEXT:    shlq $60, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r10, %rcx
+; AVX512F-ONLY-NEXT:    shlq $61, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    shlq $62, %r10
+; AVX512F-ONLY-NEXT:    orq %rcx, %r10
+; AVX512F-ONLY-NEXT:    shlq $63, %rdi
+; AVX512F-ONLY-NEXT:    orq %r10, %rdi
+; AVX512F-ONLY-NEXT:    orq %r11, %rdi
+; AVX512F-ONLY-NEXT:    movq %rdi, (%rsi)
+; AVX512F-ONLY-NEXT:    popq %rbx
+; AVX512F-ONLY-NEXT:    popq %r12
+; AVX512F-ONLY-NEXT:    popq %r13
+; AVX512F-ONLY-NEXT:    popq %r14
+; AVX512F-ONLY-NEXT:    popq %r15
+; AVX512F-ONLY-NEXT:    popq %rbp
+; AVX512F-ONLY-NEXT:    retq
+;
+; AVX512DQ-LABEL: mask_replication_factor6_vf16:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    pushq %rbp
+; AVX512DQ-NEXT:    pushq %r15
+; AVX512DQ-NEXT:    pushq %r14
+; AVX512DQ-NEXT:    pushq %r13
+; AVX512DQ-NEXT:    pushq %r12
+; AVX512DQ-NEXT:    pushq %rbx
+; AVX512DQ-NEXT:    kmovw (%rdi), %k0
+; AVX512DQ-NEXT:    kshiftrw $10, %k0, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %edi
+; AVX512DQ-NEXT:    kshiftrw $5, %k0, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %ecx
+; AVX512DQ-NEXT:    kshiftrw $4, %k0, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %eax
+; AVX512DQ-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512DQ-NEXT:    kshiftrw $3, %k0, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %edx
+; AVX512DQ-NEXT:    kshiftrw $2, %k0, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %r8d
+; AVX512DQ-NEXT:    kshiftrw $1, %k0, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %r9d
+; AVX512DQ-NEXT:    kshiftrw $11, %k0, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %r12d
+; AVX512DQ-NEXT:    kshiftrw $12, %k0, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %r14d
+; AVX512DQ-NEXT:    kshiftrw $13, %k0, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %ebp
+; AVX512DQ-NEXT:    kshiftrw $14, %k0, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %r15d
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %r11d
+; AVX512DQ-NEXT:    kshiftrw $6, %k0, %k1
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movl %edi, %r10d
+; AVX512DQ-NEXT:    andl $1, %r10d
+; AVX512DQ-NEXT:    leal (%r10,%r10,2), %ebx
+; AVX512DQ-NEXT:    leal (%rbx,%r12,4), %ebx
+; AVX512DQ-NEXT:    leal (%rbx,%r12,8), %ebx
+; AVX512DQ-NEXT:    movl %r12d, %r13d
+; AVX512DQ-NEXT:    shll $4, %r13d
+; AVX512DQ-NEXT:    orl %ebx, %r13d
+; AVX512DQ-NEXT:    movl %r12d, %ebx
+; AVX512DQ-NEXT:    shll $5, %ebx
+; AVX512DQ-NEXT:    orl %r13d, %ebx
+; AVX512DQ-NEXT:    movl %r12d, %r13d
+; AVX512DQ-NEXT:    shll $6, %r13d
+; AVX512DQ-NEXT:    shll $7, %r12d
+; AVX512DQ-NEXT:    orl %r13d, %r12d
+; AVX512DQ-NEXT:    andl $1, %r14d
+; AVX512DQ-NEXT:    movl %r14d, %r13d
+; AVX512DQ-NEXT:    shll $8, %r13d
+; AVX512DQ-NEXT:    orl %r12d, %r13d
+; AVX512DQ-NEXT:    movl %r14d, %r12d
+; AVX512DQ-NEXT:    shll $9, %r12d
+; AVX512DQ-NEXT:    orl %r13d, %r12d
+; AVX512DQ-NEXT:    movl %r14d, %r13d
+; AVX512DQ-NEXT:    shll $10, %r13d
+; AVX512DQ-NEXT:    orl %r12d, %r13d
+; AVX512DQ-NEXT:    movl %r14d, %r12d
+; AVX512DQ-NEXT:    shll $11, %r12d
+; AVX512DQ-NEXT:    orl %r13d, %r12d
+; AVX512DQ-NEXT:    movl %r14d, %r13d
+; AVX512DQ-NEXT:    shll $12, %r13d
+; AVX512DQ-NEXT:    orl %r12d, %r13d
+; AVX512DQ-NEXT:    kmovw %k1, %r12d
+; AVX512DQ-NEXT:    kshiftrw $7, %k0, %k1
+; AVX512DQ-NEXT:    shll $13, %r14d
+; AVX512DQ-NEXT:    orl %r13d, %r14d
+; AVX512DQ-NEXT:    andl $1, %ebp
+; AVX512DQ-NEXT:    movl %ebp, %r13d
+; AVX512DQ-NEXT:    shll $14, %r13d
+; AVX512DQ-NEXT:    orl %r14d, %r13d
+; AVX512DQ-NEXT:    movl %ebp, %r14d
+; AVX512DQ-NEXT:    shll $15, %r14d
+; AVX512DQ-NEXT:    orl %r13d, %r14d
+; AVX512DQ-NEXT:    movl %ebp, %r13d
+; AVX512DQ-NEXT:    shll $16, %r13d
+; AVX512DQ-NEXT:    orl %r14d, %r13d
+; AVX512DQ-NEXT:    movl %ebp, %r14d
+; AVX512DQ-NEXT:    shll $17, %r14d
+; AVX512DQ-NEXT:    orl %r13d, %r14d
+; AVX512DQ-NEXT:    movl %ebp, %r13d
+; AVX512DQ-NEXT:    shll $18, %r13d
+; AVX512DQ-NEXT:    orl %r14d, %r13d
+; AVX512DQ-NEXT:    kmovw %k1, %r14d
+; AVX512DQ-NEXT:    kshiftrw $8, %k0, %k1
+; AVX512DQ-NEXT:    shll $19, %ebp
+; AVX512DQ-NEXT:    orl %r13d, %ebp
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    movl %r15d, %r13d
+; AVX512DQ-NEXT:    shll $20, %r13d
+; AVX512DQ-NEXT:    orl %ebp, %r13d
+; AVX512DQ-NEXT:    movl %r15d, %ebp
+; AVX512DQ-NEXT:    shll $21, %ebp
+; AVX512DQ-NEXT:    orl %r13d, %ebp
+; AVX512DQ-NEXT:    movl %r15d, %r13d
+; AVX512DQ-NEXT:    shll $22, %r13d
+; AVX512DQ-NEXT:    orl %ebp, %r13d
+; AVX512DQ-NEXT:    movl %r15d, %ebp
+; AVX512DQ-NEXT:    shll $23, %ebp
+; AVX512DQ-NEXT:    orl %r13d, %ebp
+; AVX512DQ-NEXT:    movl %r15d, %r13d
+; AVX512DQ-NEXT:    shll $24, %r13d
+; AVX512DQ-NEXT:    orl %ebp, %r13d
+; AVX512DQ-NEXT:    shll $25, %r15d
+; AVX512DQ-NEXT:    orl %r13d, %r15d
+; AVX512DQ-NEXT:    movl %r11d, %r13d
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movl %r13d, %ebp
+; AVX512DQ-NEXT:    shll $26, %ebp
+; AVX512DQ-NEXT:    orl %r15d, %ebp
+; AVX512DQ-NEXT:    movl %r13d, %r15d
+; AVX512DQ-NEXT:    shll $27, %r15d
+; AVX512DQ-NEXT:    orl %ebp, %r15d
+; AVX512DQ-NEXT:    movl %r13d, %ebp
+; AVX512DQ-NEXT:    shll $28, %ebp
+; AVX512DQ-NEXT:    orl %r15d, %ebp
+; AVX512DQ-NEXT:    movl %r13d, %eax
+; AVX512DQ-NEXT:    shll $29, %eax
+; AVX512DQ-NEXT:    orl %ebp, %eax
+; AVX512DQ-NEXT:    kmovw %k1, %r15d
+; AVX512DQ-NEXT:    shll $30, %r13d
+; AVX512DQ-NEXT:    orl %eax, %r13d
+; AVX512DQ-NEXT:    kmovw %k0, %ebp
+; AVX512DQ-NEXT:    kshiftrw $9, %k0, %k0
+; AVX512DQ-NEXT:    shll $31, %r11d
+; AVX512DQ-NEXT:    orl %r13d, %r11d
+; AVX512DQ-NEXT:    kmovw %k0, %r13d
+; AVX512DQ-NEXT:    movzbl %dil, %edi
+; AVX512DQ-NEXT:    orl %ebx, %r11d
+; AVX512DQ-NEXT:    movl %r11d, 8(%rsi)
+; AVX512DQ-NEXT:    movzbl %bpl, %r11d
+; AVX512DQ-NEXT:    andl $1, %r11d
+; AVX512DQ-NEXT:    leaq (%r11,%r11,2), %rax
+; AVX512DQ-NEXT:    leaq (%rax,%r11,4), %rax
+; AVX512DQ-NEXT:    leaq (%rax,%r11,8), %rax
+; AVX512DQ-NEXT:    movq %r11, %rbx
+; AVX512DQ-NEXT:    shlq $4, %rbx
+; AVX512DQ-NEXT:    orq %rax, %rbx
+; AVX512DQ-NEXT:    shlq $5, %r11
+; AVX512DQ-NEXT:    orq %rbx, %r11
+; AVX512DQ-NEXT:    movzbl %r9b, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %r9
+; AVX512DQ-NEXT:    shlq $6, %r9
+; AVX512DQ-NEXT:    movq %rax, %rbx
+; AVX512DQ-NEXT:    shlq $7, %rbx
+; AVX512DQ-NEXT:    orq %r9, %rbx
+; AVX512DQ-NEXT:    movq %rax, %r9
+; AVX512DQ-NEXT:    shlq $8, %r9
+; AVX512DQ-NEXT:    orq %rbx, %r9
+; AVX512DQ-NEXT:    movq %rax, %rbx
+; AVX512DQ-NEXT:    shlq $9, %rbx
+; AVX512DQ-NEXT:    orq %r9, %rbx
+; AVX512DQ-NEXT:    movq %rax, %r9
+; AVX512DQ-NEXT:    shlq $10, %r9
+; AVX512DQ-NEXT:    orq %rbx, %r9
+; AVX512DQ-NEXT:    shlq $11, %rax
+; AVX512DQ-NEXT:    orq %r9, %rax
+; AVX512DQ-NEXT:    movzbl %r8b, %r8d
+; AVX512DQ-NEXT:    andl $1, %r8d
+; AVX512DQ-NEXT:    movq %r8, %r9
+; AVX512DQ-NEXT:    shlq $12, %r9
+; AVX512DQ-NEXT:    orq %rax, %r9
+; AVX512DQ-NEXT:    movq %r8, %rax
+; AVX512DQ-NEXT:    shlq $13, %rax
+; AVX512DQ-NEXT:    orq %r9, %rax
+; AVX512DQ-NEXT:    movq %r8, %r9
+; AVX512DQ-NEXT:    shlq $14, %r9
+; AVX512DQ-NEXT:    orq %rax, %r9
+; AVX512DQ-NEXT:    movq %r8, %rax
+; AVX512DQ-NEXT:    shlq $15, %rax
+; AVX512DQ-NEXT:    orq %r9, %rax
+; AVX512DQ-NEXT:    movq %r8, %r9
+; AVX512DQ-NEXT:    shlq $16, %r9
+; AVX512DQ-NEXT:    orq %rax, %r9
+; AVX512DQ-NEXT:    shlq $17, %r8
+; AVX512DQ-NEXT:    orq %r9, %r8
+; AVX512DQ-NEXT:    movzbl %dl, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %rdx
+; AVX512DQ-NEXT:    shlq $18, %rdx
+; AVX512DQ-NEXT:    orq %r8, %rdx
+; AVX512DQ-NEXT:    movq %rax, %r8
+; AVX512DQ-NEXT:    shlq $19, %r8
+; AVX512DQ-NEXT:    orq %rdx, %r8
+; AVX512DQ-NEXT:    movq %rax, %rdx
+; AVX512DQ-NEXT:    shlq $20, %rdx
+; AVX512DQ-NEXT:    orq %r8, %rdx
+; AVX512DQ-NEXT:    movq %rax, %r8
+; AVX512DQ-NEXT:    shlq $21, %r8
+; AVX512DQ-NEXT:    orq %rdx, %r8
+; AVX512DQ-NEXT:    movq %rax, %rdx
+; AVX512DQ-NEXT:    shlq $22, %rdx
+; AVX512DQ-NEXT:    orq %r8, %rdx
+; AVX512DQ-NEXT:    shlq $23, %rax
+; AVX512DQ-NEXT:    orq %rdx, %rax
+; AVX512DQ-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
+; AVX512DQ-NEXT:    andl $1, %edx
+; AVX512DQ-NEXT:    movq %rdx, %r8
+; AVX512DQ-NEXT:    shlq $24, %r8
+; AVX512DQ-NEXT:    orq %rax, %r8
+; AVX512DQ-NEXT:    movq %rdx, %rax
+; AVX512DQ-NEXT:    shlq $25, %rax
+; AVX512DQ-NEXT:    orq %r8, %rax
+; AVX512DQ-NEXT:    movq %rdx, %r8
+; AVX512DQ-NEXT:    shlq $26, %r8
+; AVX512DQ-NEXT:    orq %rax, %r8
+; AVX512DQ-NEXT:    movq %rdx, %rax
+; AVX512DQ-NEXT:    shlq $27, %rax
+; AVX512DQ-NEXT:    orq %r8, %rax
+; AVX512DQ-NEXT:    movq %rdx, %r8
+; AVX512DQ-NEXT:    shlq $28, %r8
+; AVX512DQ-NEXT:    orq %rax, %r8
+; AVX512DQ-NEXT:    shlq $29, %rdx
+; AVX512DQ-NEXT:    orq %r8, %rdx
+; AVX512DQ-NEXT:    movzbl %cl, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %r8
+; AVX512DQ-NEXT:    shlq $30, %r8
+; AVX512DQ-NEXT:    orq %rdx, %r8
+; AVX512DQ-NEXT:    shlq $31, %rax
+; AVX512DQ-NEXT:    orq %r8, %rax
+; AVX512DQ-NEXT:    andl $1, %ecx
+; AVX512DQ-NEXT:    movq %rcx, %rdx
+; AVX512DQ-NEXT:    shlq $32, %rdx
+; AVX512DQ-NEXT:    orq %rax, %rdx
+; AVX512DQ-NEXT:    movq %rcx, %rax
+; AVX512DQ-NEXT:    shlq $33, %rax
+; AVX512DQ-NEXT:    orq %rdx, %rax
+; AVX512DQ-NEXT:    movq %rcx, %rdx
+; AVX512DQ-NEXT:    shlq $34, %rdx
+; AVX512DQ-NEXT:    orq %rax, %rdx
+; AVX512DQ-NEXT:    shlq $35, %rcx
+; AVX512DQ-NEXT:    orq %rdx, %rcx
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $36, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    movq %r12, %rcx
+; AVX512DQ-NEXT:    shlq $37, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $38, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    movq %r12, %rcx
+; AVX512DQ-NEXT:    shlq $39, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $40, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    shlq $41, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    andl $1, %r14d
+; AVX512DQ-NEXT:    movq %r14, %rax
+; AVX512DQ-NEXT:    shlq $42, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r14, %rcx
+; AVX512DQ-NEXT:    shlq $43, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    movq %r14, %rax
+; AVX512DQ-NEXT:    shlq $44, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    movq %r14, %rcx
+; AVX512DQ-NEXT:    shlq $45, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    movq %r14, %rax
+; AVX512DQ-NEXT:    shlq $46, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    shlq $47, %r14
+; AVX512DQ-NEXT:    orq %rax, %r14
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $48, %rax
+; AVX512DQ-NEXT:    orq %r14, %rax
+; AVX512DQ-NEXT:    movq %r15, %rcx
+; AVX512DQ-NEXT:    shlq $49, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $50, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    movq %r15, %rcx
+; AVX512DQ-NEXT:    shlq $51, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $52, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    shlq $53, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $54, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movq %r13, %rcx
+; AVX512DQ-NEXT:    shlq $55, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $56, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    movq %r13, %rcx
+; AVX512DQ-NEXT:    shlq $57, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $58, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    shlq $59, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r10, %rax
+; AVX512DQ-NEXT:    shlq $60, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r10, %rcx
+; AVX512DQ-NEXT:    shlq $61, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    shlq $62, %r10
+; AVX512DQ-NEXT:    orq %rcx, %r10
+; AVX512DQ-NEXT:    shlq $63, %rdi
+; AVX512DQ-NEXT:    orq %r10, %rdi
+; AVX512DQ-NEXT:    orq %r11, %rdi
+; AVX512DQ-NEXT:    movq %rdi, (%rsi)
+; AVX512DQ-NEXT:    popq %rbx
+; AVX512DQ-NEXT:    popq %r12
+; AVX512DQ-NEXT:    popq %r13
+; AVX512DQ-NEXT:    popq %r14
+; AVX512DQ-NEXT:    popq %r15
+; AVX512DQ-NEXT:    popq %rbp
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: mask_replication_factor6_vf16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    pushq %rbp
+; AVX512BW-NEXT:    pushq %r15
+; AVX512BW-NEXT:    pushq %r14
+; AVX512BW-NEXT:    pushq %r13
+; AVX512BW-NEXT:    pushq %r12
+; AVX512BW-NEXT:    pushq %rbx
+; AVX512BW-NEXT:    kmovw (%rdi), %k0
+; AVX512BW-NEXT:    kshiftrw $10, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %edi
+; AVX512BW-NEXT:    kshiftrw $5, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %ecx
+; AVX512BW-NEXT:    kshiftrw $4, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512BW-NEXT:    kshiftrw $3, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %edx
+; AVX512BW-NEXT:    kshiftrw $2, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r8d
+; AVX512BW-NEXT:    kshiftrw $1, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r9d
+; AVX512BW-NEXT:    kshiftrw $11, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrw $12, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r14d
+; AVX512BW-NEXT:    kshiftrw $13, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %ebp
+; AVX512BW-NEXT:    kshiftrw $14, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r15d
+; AVX512BW-NEXT:    kshiftrw $15, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r11d
+; AVX512BW-NEXT:    kshiftrw $6, %k0, %k1
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movl %edi, %r10d
+; AVX512BW-NEXT:    andl $1, %r10d
+; AVX512BW-NEXT:    leal (%r10,%r10,2), %ebx
+; AVX512BW-NEXT:    leal (%rbx,%r12,4), %ebx
+; AVX512BW-NEXT:    leal (%rbx,%r12,8), %ebx
+; AVX512BW-NEXT:    movl %r12d, %r13d
+; AVX512BW-NEXT:    shll $4, %r13d
+; AVX512BW-NEXT:    orl %ebx, %r13d
+; AVX512BW-NEXT:    movl %r12d, %ebx
+; AVX512BW-NEXT:    shll $5, %ebx
+; AVX512BW-NEXT:    orl %r13d, %ebx
+; AVX512BW-NEXT:    movl %r12d, %r13d
+; AVX512BW-NEXT:    shll $6, %r13d
+; AVX512BW-NEXT:    shll $7, %r12d
+; AVX512BW-NEXT:    orl %r13d, %r12d
+; AVX512BW-NEXT:    andl $1, %r14d
+; AVX512BW-NEXT:    movl %r14d, %r13d
+; AVX512BW-NEXT:    shll $8, %r13d
+; AVX512BW-NEXT:    orl %r12d, %r13d
+; AVX512BW-NEXT:    movl %r14d, %r12d
+; AVX512BW-NEXT:    shll $9, %r12d
+; AVX512BW-NEXT:    orl %r13d, %r12d
+; AVX512BW-NEXT:    movl %r14d, %r13d
+; AVX512BW-NEXT:    shll $10, %r13d
+; AVX512BW-NEXT:    orl %r12d, %r13d
+; AVX512BW-NEXT:    movl %r14d, %r12d
+; AVX512BW-NEXT:    shll $11, %r12d
+; AVX512BW-NEXT:    orl %r13d, %r12d
+; AVX512BW-NEXT:    movl %r14d, %r13d
+; AVX512BW-NEXT:    shll $12, %r13d
+; AVX512BW-NEXT:    orl %r12d, %r13d
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrw $7, %k0, %k1
+; AVX512BW-NEXT:    shll $13, %r14d
+; AVX512BW-NEXT:    orl %r13d, %r14d
+; AVX512BW-NEXT:    andl $1, %ebp
+; AVX512BW-NEXT:    movl %ebp, %r13d
+; AVX512BW-NEXT:    shll $14, %r13d
+; AVX512BW-NEXT:    orl %r14d, %r13d
+; AVX512BW-NEXT:    movl %ebp, %r14d
+; AVX512BW-NEXT:    shll $15, %r14d
+; AVX512BW-NEXT:    orl %r13d, %r14d
+; AVX512BW-NEXT:    movl %ebp, %r13d
+; AVX512BW-NEXT:    shll $16, %r13d
+; AVX512BW-NEXT:    orl %r14d, %r13d
+; AVX512BW-NEXT:    movl %ebp, %r14d
+; AVX512BW-NEXT:    shll $17, %r14d
+; AVX512BW-NEXT:    orl %r13d, %r14d
+; AVX512BW-NEXT:    movl %ebp, %r13d
+; AVX512BW-NEXT:    shll $18, %r13d
+; AVX512BW-NEXT:    orl %r14d, %r13d
+; AVX512BW-NEXT:    kmovd %k1, %r14d
+; AVX512BW-NEXT:    kshiftrw $8, %k0, %k1
+; AVX512BW-NEXT:    shll $19, %ebp
+; AVX512BW-NEXT:    orl %r13d, %ebp
+; AVX512BW-NEXT:    andl $1, %r15d
+; AVX512BW-NEXT:    movl %r15d, %r13d
+; AVX512BW-NEXT:    shll $20, %r13d
+; AVX512BW-NEXT:    orl %ebp, %r13d
+; AVX512BW-NEXT:    movl %r15d, %ebp
+; AVX512BW-NEXT:    shll $21, %ebp
+; AVX512BW-NEXT:    orl %r13d, %ebp
+; AVX512BW-NEXT:    movl %r15d, %r13d
+; AVX512BW-NEXT:    shll $22, %r13d
+; AVX512BW-NEXT:    orl %ebp, %r13d
+; AVX512BW-NEXT:    movl %r15d, %ebp
+; AVX512BW-NEXT:    shll $23, %ebp
+; AVX512BW-NEXT:    orl %r13d, %ebp
+; AVX512BW-NEXT:    movl %r15d, %r13d
+; AVX512BW-NEXT:    shll $24, %r13d
+; AVX512BW-NEXT:    orl %ebp, %r13d
+; AVX512BW-NEXT:    shll $25, %r15d
+; AVX512BW-NEXT:    orl %r13d, %r15d
+; AVX512BW-NEXT:    movl %r11d, %r13d
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movl %r13d, %ebp
+; AVX512BW-NEXT:    shll $26, %ebp
+; AVX512BW-NEXT:    orl %r15d, %ebp
+; AVX512BW-NEXT:    movl %r13d, %r15d
+; AVX512BW-NEXT:    shll $27, %r15d
+; AVX512BW-NEXT:    orl %ebp, %r15d
+; AVX512BW-NEXT:    movl %r13d, %ebp
+; AVX512BW-NEXT:    shll $28, %ebp
+; AVX512BW-NEXT:    orl %r15d, %ebp
+; AVX512BW-NEXT:    movl %r13d, %eax
+; AVX512BW-NEXT:    shll $29, %eax
+; AVX512BW-NEXT:    orl %ebp, %eax
+; AVX512BW-NEXT:    kmovd %k1, %r15d
+; AVX512BW-NEXT:    shll $30, %r13d
+; AVX512BW-NEXT:    orl %eax, %r13d
+; AVX512BW-NEXT:    kmovd %k0, %ebp
+; AVX512BW-NEXT:    kshiftrw $9, %k0, %k0
+; AVX512BW-NEXT:    shll $31, %r11d
+; AVX512BW-NEXT:    orl %r13d, %r11d
+; AVX512BW-NEXT:    kmovd %k0, %r13d
+; AVX512BW-NEXT:    movzbl %dil, %edi
+; AVX512BW-NEXT:    orl %ebx, %r11d
+; AVX512BW-NEXT:    movl %r11d, 8(%rsi)
+; AVX512BW-NEXT:    movzbl %bpl, %r11d
+; AVX512BW-NEXT:    andl $1, %r11d
+; AVX512BW-NEXT:    leaq (%r11,%r11,2), %rax
+; AVX512BW-NEXT:    leaq (%rax,%r11,4), %rax
+; AVX512BW-NEXT:    leaq (%rax,%r11,8), %rax
+; AVX512BW-NEXT:    movq %r11, %rbx
+; AVX512BW-NEXT:    shlq $4, %rbx
+; AVX512BW-NEXT:    orq %rax, %rbx
+; AVX512BW-NEXT:    shlq $5, %r11
+; AVX512BW-NEXT:    orq %rbx, %r11
+; AVX512BW-NEXT:    movzbl %r9b, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %r9
+; AVX512BW-NEXT:    shlq $6, %r9
+; AVX512BW-NEXT:    movq %rax, %rbx
+; AVX512BW-NEXT:    shlq $7, %rbx
+; AVX512BW-NEXT:    orq %r9, %rbx
+; AVX512BW-NEXT:    movq %rax, %r9
+; AVX512BW-NEXT:    shlq $8, %r9
+; AVX512BW-NEXT:    orq %rbx, %r9
+; AVX512BW-NEXT:    movq %rax, %rbx
+; AVX512BW-NEXT:    shlq $9, %rbx
+; AVX512BW-NEXT:    orq %r9, %rbx
+; AVX512BW-NEXT:    movq %rax, %r9
+; AVX512BW-NEXT:    shlq $10, %r9
+; AVX512BW-NEXT:    orq %rbx, %r9
+; AVX512BW-NEXT:    shlq $11, %rax
+; AVX512BW-NEXT:    orq %r9, %rax
+; AVX512BW-NEXT:    movzbl %r8b, %r8d
+; AVX512BW-NEXT:    andl $1, %r8d
+; AVX512BW-NEXT:    movq %r8, %r9
+; AVX512BW-NEXT:    shlq $12, %r9
+; AVX512BW-NEXT:    orq %rax, %r9
+; AVX512BW-NEXT:    movq %r8, %rax
+; AVX512BW-NEXT:    shlq $13, %rax
+; AVX512BW-NEXT:    orq %r9, %rax
+; AVX512BW-NEXT:    movq %r8, %r9
+; AVX512BW-NEXT:    shlq $14, %r9
+; AVX512BW-NEXT:    orq %rax, %r9
+; AVX512BW-NEXT:    movq %r8, %rax
+; AVX512BW-NEXT:    shlq $15, %rax
+; AVX512BW-NEXT:    orq %r9, %rax
+; AVX512BW-NEXT:    movq %r8, %r9
+; AVX512BW-NEXT:    shlq $16, %r9
+; AVX512BW-NEXT:    orq %rax, %r9
+; AVX512BW-NEXT:    shlq $17, %r8
+; AVX512BW-NEXT:    orq %r9, %r8
+; AVX512BW-NEXT:    movzbl %dl, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %rdx
+; AVX512BW-NEXT:    shlq $18, %rdx
+; AVX512BW-NEXT:    orq %r8, %rdx
+; AVX512BW-NEXT:    movq %rax, %r8
+; AVX512BW-NEXT:    shlq $19, %r8
+; AVX512BW-NEXT:    orq %rdx, %r8
+; AVX512BW-NEXT:    movq %rax, %rdx
+; AVX512BW-NEXT:    shlq $20, %rdx
+; AVX512BW-NEXT:    orq %r8, %rdx
+; AVX512BW-NEXT:    movq %rax, %r8
+; AVX512BW-NEXT:    shlq $21, %r8
+; AVX512BW-NEXT:    orq %rdx, %r8
+; AVX512BW-NEXT:    movq %rax, %rdx
+; AVX512BW-NEXT:    shlq $22, %rdx
+; AVX512BW-NEXT:    orq %r8, %rdx
+; AVX512BW-NEXT:    shlq $23, %rax
+; AVX512BW-NEXT:    orq %rdx, %rax
+; AVX512BW-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
+; AVX512BW-NEXT:    andl $1, %edx
+; AVX512BW-NEXT:    movq %rdx, %r8
+; AVX512BW-NEXT:    shlq $24, %r8
+; AVX512BW-NEXT:    orq %rax, %r8
+; AVX512BW-NEXT:    movq %rdx, %rax
+; AVX512BW-NEXT:    shlq $25, %rax
+; AVX512BW-NEXT:    orq %r8, %rax
+; AVX512BW-NEXT:    movq %rdx, %r8
+; AVX512BW-NEXT:    shlq $26, %r8
+; AVX512BW-NEXT:    orq %rax, %r8
+; AVX512BW-NEXT:    movq %rdx, %rax
+; AVX512BW-NEXT:    shlq $27, %rax
+; AVX512BW-NEXT:    orq %r8, %rax
+; AVX512BW-NEXT:    movq %rdx, %r8
+; AVX512BW-NEXT:    shlq $28, %r8
+; AVX512BW-NEXT:    orq %rax, %r8
+; AVX512BW-NEXT:    shlq $29, %rdx
+; AVX512BW-NEXT:    orq %r8, %rdx
+; AVX512BW-NEXT:    movzbl %cl, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %r8
+; AVX512BW-NEXT:    shlq $30, %r8
+; AVX512BW-NEXT:    orq %rdx, %r8
+; AVX512BW-NEXT:    shlq $31, %rax
+; AVX512BW-NEXT:    orq %r8, %rax
+; AVX512BW-NEXT:    andl $1, %ecx
+; AVX512BW-NEXT:    movq %rcx, %rdx
+; AVX512BW-NEXT:    shlq $32, %rdx
+; AVX512BW-NEXT:    orq %rax, %rdx
+; AVX512BW-NEXT:    movq %rcx, %rax
+; AVX512BW-NEXT:    shlq $33, %rax
+; AVX512BW-NEXT:    orq %rdx, %rax
+; AVX512BW-NEXT:    movq %rcx, %rdx
+; AVX512BW-NEXT:    shlq $34, %rdx
+; AVX512BW-NEXT:    orq %rax, %rdx
+; AVX512BW-NEXT:    shlq $35, %rcx
+; AVX512BW-NEXT:    orq %rdx, %rcx
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $36, %rax
+; AVX512BW-NEXT:    orq %rcx, %rax
+; AVX512BW-NEXT:    movq %r12, %rcx
+; AVX512BW-NEXT:    shlq $37, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $38, %rax
+; AVX512BW-NEXT:    orq %rcx, %rax
+; AVX512BW-NEXT:    movq %r12, %rcx
+; AVX512BW-NEXT:    shlq $39, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $40, %rax
+; AVX512BW-NEXT:    orq %rcx, %rax
+; AVX512BW-NEXT:    shlq $41, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    andl $1, %r14d
+; AVX512BW-NEXT:    movq %r14, %rax
+; AVX512BW-NEXT:    shlq $42, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r14, %rcx
+; AVX512BW-NEXT:    shlq $43, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    movq %r14, %rax
+; AVX512BW-NEXT:    shlq $44, %rax
+; AVX512BW-NEXT:    orq %rcx, %rax
+; AVX512BW-NEXT:    movq %r14, %rcx
+; AVX512BW-NEXT:    shlq $45, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    movq %r14, %rax
+; AVX512BW-NEXT:    shlq $46, %rax
+; AVX512BW-NEXT:    orq %rcx, %rax
+; AVX512BW-NEXT:    shlq $47, %r14
+; AVX512BW-NEXT:    orq %rax, %r14
+; AVX512BW-NEXT:    andl $1, %r15d
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $48, %rax
+; AVX512BW-NEXT:    orq %r14, %rax
+; AVX512BW-NEXT:    movq %r15, %rcx
+; AVX512BW-NEXT:    shlq $49, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $50, %rax
+; AVX512BW-NEXT:    orq %rcx, %rax
+; AVX512BW-NEXT:    movq %r15, %rcx
+; AVX512BW-NEXT:    shlq $51, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $52, %rax
+; AVX512BW-NEXT:    orq %rcx, %rax
+; AVX512BW-NEXT:    shlq $53, %r15
+; AVX512BW-NEXT:    orq %rax, %r15
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $54, %rax
+; AVX512BW-NEXT:    orq %r15, %rax
+; AVX512BW-NEXT:    movq %r13, %rcx
+; AVX512BW-NEXT:    shlq $55, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $56, %rax
+; AVX512BW-NEXT:    orq %rcx, %rax
+; AVX512BW-NEXT:    movq %r13, %rcx
+; AVX512BW-NEXT:    shlq $57, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $58, %rax
+; AVX512BW-NEXT:    orq %rcx, %rax
+; AVX512BW-NEXT:    shlq $59, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r10, %rax
+; AVX512BW-NEXT:    shlq $60, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r10, %rcx
+; AVX512BW-NEXT:    shlq $61, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    shlq $62, %r10
+; AVX512BW-NEXT:    orq %rcx, %r10
+; AVX512BW-NEXT:    shlq $63, %rdi
+; AVX512BW-NEXT:    orq %r10, %rdi
+; AVX512BW-NEXT:    orq %r11, %rdi
+; AVX512BW-NEXT:    movq %rdi, (%rsi)
+; AVX512BW-NEXT:    popq %rbx
+; AVX512BW-NEXT:    popq %r12
+; AVX512BW-NEXT:    popq %r13
+; AVX512BW-NEXT:    popq %r14
+; AVX512BW-NEXT:    popq %r15
+; AVX512BW-NEXT:    popq %rbp
+; AVX512BW-NEXT:    retq
+  %src.vec = load <16 x i1>, ptr %in.vec, align 64
+  %tgt.mask = shufflevector <16 x i1> %src.vec, <16 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+  store <96 x i1> %tgt.mask, ptr %out.vec, align 64
+  ret void
+}
+
+define void @mask_replication_factor6_vf32(ptr %in.vec, ptr %out.vec) nounwind {
+; AVX512F-ONLY-LABEL: mask_replication_factor6_vf32:
+; AVX512F-ONLY:       # %bb.0:
+; AVX512F-ONLY-NEXT:    pushq %rbp
+; AVX512F-ONLY-NEXT:    pushq %r15
+; AVX512F-ONLY-NEXT:    pushq %r14
+; AVX512F-ONLY-NEXT:    pushq %r13
+; AVX512F-ONLY-NEXT:    pushq %r12
+; AVX512F-ONLY-NEXT:    pushq %rbx
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
+; AVX512F-ONLY-NEXT:    kmovw 2(%rdi), %k0
+; AVX512F-ONLY-NEXT:    kshiftrw $15, %k0, %k2
+; AVX512F-ONLY-NEXT:    kmovw %k2, %edx
+; AVX512F-ONLY-NEXT:    kshiftrw $10, %k0, %k2
+; AVX512F-ONLY-NEXT:    kmovw %k2, %ecx
+; AVX512F-ONLY-NEXT:    kshiftrw $9, %k0, %k2
+; AVX512F-ONLY-NEXT:    kmovw %k2, %eax
+; AVX512F-ONLY-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512F-ONLY-NEXT:    kshiftrw $8, %k0, %k2
+; AVX512F-ONLY-NEXT:    kmovw %k2, %edi
+; AVX512F-ONLY-NEXT:    kshiftrw $7, %k0, %k2
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r8d
+; AVX512F-ONLY-NEXT:    kshiftrw $6, %k0, %k2
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r9d
+; AVX512F-ONLY-NEXT:    kshiftrw $5, %k0, %k2
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r10d
+; AVX512F-ONLY-NEXT:    kshiftrw $3, %k0, %k2
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r11d
+; AVX512F-ONLY-NEXT:    kshiftrw $15, %k1, %k2
+; AVX512F-ONLY-NEXT:    kmovw %k2, %ebx
+; AVX512F-ONLY-NEXT:    kshiftrw $5, %k1, %k2
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r15d
+; AVX512F-ONLY-NEXT:    kshiftrw $1, %k1, %k2
+; AVX512F-ONLY-NEXT:    kmovw %k2, %ebp
+; AVX512F-ONLY-NEXT:    kshiftrw $2, %k1, %k2
+; AVX512F-ONLY-NEXT:    kmovw %k1, %r14d
+; AVX512F-ONLY-NEXT:    movzbl %r14b, %r14d
+; AVX512F-ONLY-NEXT:    andl $1, %r14d
+; AVX512F-ONLY-NEXT:    leaq (%r14,%r14,2), %r12
+; AVX512F-ONLY-NEXT:    leaq (%r12,%r14,4), %r12
+; AVX512F-ONLY-NEXT:    leaq (%r12,%r14,8), %r12
+; AVX512F-ONLY-NEXT:    movq %r14, %r13
+; AVX512F-ONLY-NEXT:    shlq $4, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $5, %r14
+; AVX512F-ONLY-NEXT:    orq %r13, %r14
+; AVX512F-ONLY-NEXT:    movzbl %bpl, %r13d
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $6, %r12
+; AVX512F-ONLY-NEXT:    movq %r13, %rbp
+; AVX512F-ONLY-NEXT:    shlq $7, %rbp
+; AVX512F-ONLY-NEXT:    orq %r12, %rbp
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $8, %r12
+; AVX512F-ONLY-NEXT:    orq %rbp, %r12
+; AVX512F-ONLY-NEXT:    movq %r13, %rbp
+; AVX512F-ONLY-NEXT:    shlq $9, %rbp
+; AVX512F-ONLY-NEXT:    orq %r12, %rbp
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $10, %r12
+; AVX512F-ONLY-NEXT:    orq %rbp, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k2, %ebp
+; AVX512F-ONLY-NEXT:    kshiftrw $3, %k1, %k2
+; AVX512F-ONLY-NEXT:    shlq $11, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    movzbl %bpl, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rbp
+; AVX512F-ONLY-NEXT:    shlq $12, %rbp
+; AVX512F-ONLY-NEXT:    orq %r13, %rbp
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $13, %r13
+; AVX512F-ONLY-NEXT:    orq %rbp, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %rbp
+; AVX512F-ONLY-NEXT:    shlq $14, %rbp
+; AVX512F-ONLY-NEXT:    orq %r13, %rbp
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $15, %r13
+; AVX512F-ONLY-NEXT:    orq %rbp, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %rbp
+; AVX512F-ONLY-NEXT:    shlq $16, %rbp
+; AVX512F-ONLY-NEXT:    orq %r13, %rbp
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r13d
+; AVX512F-ONLY-NEXT:    kshiftrw $4, %k1, %k2
+; AVX512F-ONLY-NEXT:    shlq $17, %r12
+; AVX512F-ONLY-NEXT:    orq %rbp, %r12
+; AVX512F-ONLY-NEXT:    movzbl %r13b, %r13d
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %rbp
+; AVX512F-ONLY-NEXT:    shlq $18, %rbp
+; AVX512F-ONLY-NEXT:    orq %r12, %rbp
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $19, %r12
+; AVX512F-ONLY-NEXT:    orq %rbp, %r12
+; AVX512F-ONLY-NEXT:    movq %r13, %rbp
+; AVX512F-ONLY-NEXT:    shlq $20, %rbp
+; AVX512F-ONLY-NEXT:    orq %r12, %rbp
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $21, %r12
+; AVX512F-ONLY-NEXT:    orq %rbp, %r12
+; AVX512F-ONLY-NEXT:    movq %r13, %rbp
+; AVX512F-ONLY-NEXT:    shlq $22, %rbp
+; AVX512F-ONLY-NEXT:    orq %r12, %rbp
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $10, %k1, %k2
+; AVX512F-ONLY-NEXT:    shlq $23, %r13
+; AVX512F-ONLY-NEXT:    orq %rbp, %r13
+; AVX512F-ONLY-NEXT:    movzbl %r12b, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rbp
+; AVX512F-ONLY-NEXT:    shlq $24, %rbp
+; AVX512F-ONLY-NEXT:    orq %r13, %rbp
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $25, %r13
+; AVX512F-ONLY-NEXT:    orq %rbp, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %rbp
+; AVX512F-ONLY-NEXT:    shlq $26, %rbp
+; AVX512F-ONLY-NEXT:    orq %r13, %rbp
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $27, %r13
+; AVX512F-ONLY-NEXT:    orq %rbp, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %rbp
+; AVX512F-ONLY-NEXT:    shlq $28, %rbp
+; AVX512F-ONLY-NEXT:    orq %r13, %rbp
+; AVX512F-ONLY-NEXT:    shlq $29, %r12
+; AVX512F-ONLY-NEXT:    orq %rbp, %r12
+; AVX512F-ONLY-NEXT:    movzbl %r15b, %r13d
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %rbp
+; AVX512F-ONLY-NEXT:    shlq $30, %rbp
+; AVX512F-ONLY-NEXT:    orq %r12, %rbp
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $6, %k1, %k2
+; AVX512F-ONLY-NEXT:    shlq $31, %r13
+; AVX512F-ONLY-NEXT:    orq %rbp, %r13
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    movq %r15, %rbp
+; AVX512F-ONLY-NEXT:    shlq $32, %rbp
+; AVX512F-ONLY-NEXT:    orq %r13, %rbp
+; AVX512F-ONLY-NEXT:    movq %r15, %r13
+; AVX512F-ONLY-NEXT:    shlq $33, %r13
+; AVX512F-ONLY-NEXT:    orq %rbp, %r13
+; AVX512F-ONLY-NEXT:    movq %r15, %rbp
+; AVX512F-ONLY-NEXT:    shlq $34, %rbp
+; AVX512F-ONLY-NEXT:    orq %r13, %rbp
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r13d
+; AVX512F-ONLY-NEXT:    kshiftrw $7, %k1, %k2
+; AVX512F-ONLY-NEXT:    shlq $35, %r15
+; AVX512F-ONLY-NEXT:    orq %rbp, %r15
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %rbp
+; AVX512F-ONLY-NEXT:    shlq $36, %rbp
+; AVX512F-ONLY-NEXT:    orq %r15, %rbp
+; AVX512F-ONLY-NEXT:    movq %r13, %r15
+; AVX512F-ONLY-NEXT:    shlq $37, %r15
+; AVX512F-ONLY-NEXT:    orq %rbp, %r15
+; AVX512F-ONLY-NEXT:    movq %r13, %rbp
+; AVX512F-ONLY-NEXT:    shlq $38, %rbp
+; AVX512F-ONLY-NEXT:    orq %r15, %rbp
+; AVX512F-ONLY-NEXT:    movq %r13, %r15
+; AVX512F-ONLY-NEXT:    shlq $39, %r15
+; AVX512F-ONLY-NEXT:    orq %rbp, %r15
+; AVX512F-ONLY-NEXT:    movq %r13, %rbp
+; AVX512F-ONLY-NEXT:    shlq $40, %rbp
+; AVX512F-ONLY-NEXT:    orq %r15, %rbp
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r15d
+; AVX512F-ONLY-NEXT:    kshiftrw $8, %k1, %k2
+; AVX512F-ONLY-NEXT:    shlq $41, %r13
+; AVX512F-ONLY-NEXT:    orq %rbp, %r13
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    movq %r15, %rbp
+; AVX512F-ONLY-NEXT:    shlq $42, %rbp
+; AVX512F-ONLY-NEXT:    orq %r13, %rbp
+; AVX512F-ONLY-NEXT:    movq %r15, %r13
+; AVX512F-ONLY-NEXT:    shlq $43, %r13
+; AVX512F-ONLY-NEXT:    orq %rbp, %r13
+; AVX512F-ONLY-NEXT:    movq %r15, %rbp
+; AVX512F-ONLY-NEXT:    shlq $44, %rbp
+; AVX512F-ONLY-NEXT:    orq %r13, %rbp
+; AVX512F-ONLY-NEXT:    movq %r15, %r13
+; AVX512F-ONLY-NEXT:    shlq $45, %r13
+; AVX512F-ONLY-NEXT:    orq %rbp, %r13
+; AVX512F-ONLY-NEXT:    movq %r15, %rbp
+; AVX512F-ONLY-NEXT:    shlq $46, %rbp
+; AVX512F-ONLY-NEXT:    orq %r13, %rbp
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r13d
+; AVX512F-ONLY-NEXT:    kshiftrw $9, %k1, %k2
+; AVX512F-ONLY-NEXT:    shlq $47, %r15
+; AVX512F-ONLY-NEXT:    orq %rbp, %r15
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %rbp
+; AVX512F-ONLY-NEXT:    shlq $48, %rbp
+; AVX512F-ONLY-NEXT:    orq %r15, %rbp
+; AVX512F-ONLY-NEXT:    movq %r13, %r15
+; AVX512F-ONLY-NEXT:    shlq $49, %r15
+; AVX512F-ONLY-NEXT:    orq %rbp, %r15
+; AVX512F-ONLY-NEXT:    movq %r13, %rbp
+; AVX512F-ONLY-NEXT:    shlq $50, %rbp
+; AVX512F-ONLY-NEXT:    orq %r15, %rbp
+; AVX512F-ONLY-NEXT:    movq %r13, %r15
+; AVX512F-ONLY-NEXT:    shlq $51, %r15
+; AVX512F-ONLY-NEXT:    orq %rbp, %r15
+; AVX512F-ONLY-NEXT:    movq %r13, %rbp
+; AVX512F-ONLY-NEXT:    shlq $52, %rbp
+; AVX512F-ONLY-NEXT:    orq %r15, %rbp
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r15d
+; AVX512F-ONLY-NEXT:    kshiftrw $14, %k1, %k2
+; AVX512F-ONLY-NEXT:    shlq $53, %r13
+; AVX512F-ONLY-NEXT:    orq %rbp, %r13
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    movq %r15, %rbp
+; AVX512F-ONLY-NEXT:    shlq $54, %rbp
+; AVX512F-ONLY-NEXT:    orq %r13, %rbp
+; AVX512F-ONLY-NEXT:    movq %r15, %r13
+; AVX512F-ONLY-NEXT:    shlq $55, %r13
+; AVX512F-ONLY-NEXT:    orq %rbp, %r13
+; AVX512F-ONLY-NEXT:    movq %r15, %rbp
+; AVX512F-ONLY-NEXT:    shlq $56, %rbp
+; AVX512F-ONLY-NEXT:    orq %r13, %rbp
+; AVX512F-ONLY-NEXT:    movq %r15, %r13
+; AVX512F-ONLY-NEXT:    shlq $57, %r13
+; AVX512F-ONLY-NEXT:    orq %rbp, %r13
+; AVX512F-ONLY-NEXT:    movq %r15, %rbp
+; AVX512F-ONLY-NEXT:    shlq $58, %rbp
+; AVX512F-ONLY-NEXT:    orq %r13, %rbp
+; AVX512F-ONLY-NEXT:    shlq $59, %r15
+; AVX512F-ONLY-NEXT:    orq %rbp, %r15
+; AVX512F-ONLY-NEXT:    movl %r12d, %r13d
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %rbp
+; AVX512F-ONLY-NEXT:    shlq $60, %rbp
+; AVX512F-ONLY-NEXT:    orq %r15, %rbp
+; AVX512F-ONLY-NEXT:    movq %r13, %r15
+; AVX512F-ONLY-NEXT:    shlq $61, %r15
+; AVX512F-ONLY-NEXT:    orq %rbp, %r15
+; AVX512F-ONLY-NEXT:    kmovw %k2, %ebp
+; AVX512F-ONLY-NEXT:    kshiftrw $12, %k1, %k2
+; AVX512F-ONLY-NEXT:    movzbl %r12b, %eax
+; AVX512F-ONLY-NEXT:    shlq $62, %r13
+; AVX512F-ONLY-NEXT:    orq %r15, %r13
+; AVX512F-ONLY-NEXT:    movq %rax, %r15
+; AVX512F-ONLY-NEXT:    shlq $63, %r15
+; AVX512F-ONLY-NEXT:    orq %r13, %r15
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r13d
+; AVX512F-ONLY-NEXT:    kshiftrw $11, %k1, %k2
+; AVX512F-ONLY-NEXT:    orq %r14, %r15
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r14d
+; AVX512F-ONLY-NEXT:    kshiftrw $13, %k1, %k1
+; AVX512F-ONLY-NEXT:    movq %r15, (%rsi)
+; AVX512F-ONLY-NEXT:    movzbl %r14b, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    leaq (%rax,%rax,2), %rax
+; AVX512F-ONLY-NEXT:    leaq (%rax,%r12,4), %rax
+; AVX512F-ONLY-NEXT:    leaq (%rax,%r12,8), %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r15
+; AVX512F-ONLY-NEXT:    shlq $4, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    movq %r12, %r14
+; AVX512F-ONLY-NEXT:    shlq $5, %r14
+; AVX512F-ONLY-NEXT:    orq %r15, %r14
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $6, %rax
+; AVX512F-ONLY-NEXT:    shlq $7, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movzbl %r13b, %r15d
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $8, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $9, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $10, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $11, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $12, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k1, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $1, %k0, %k1
+; AVX512F-ONLY-NEXT:    shlq $13, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    movzbl %r12b, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $14, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r15
+; AVX512F-ONLY-NEXT:    shlq $15, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $16, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r15
+; AVX512F-ONLY-NEXT:    shlq $17, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $18, %r13
+; AVX512F-ONLY-NEXT:    orq %r15, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k1, %r15d
+; AVX512F-ONLY-NEXT:    kshiftrw $2, %k0, %k1
+; AVX512F-ONLY-NEXT:    shlq $19, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    movzbl %bpl, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %r13
+; AVX512F-ONLY-NEXT:    shlq $20, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    movq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $21, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    movq %rax, %r13
+; AVX512F-ONLY-NEXT:    shlq $22, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    movq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $23, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    movq %rax, %r13
+; AVX512F-ONLY-NEXT:    shlq $24, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k1, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $4, %k0, %k1
+; AVX512F-ONLY-NEXT:    shlq $25, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movzbl %bl, %ebx
+; AVX512F-ONLY-NEXT:    andl $1, %ebx
+; AVX512F-ONLY-NEXT:    movq %rbx, %r13
+; AVX512F-ONLY-NEXT:    shlq $26, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %rbx, %rax
+; AVX512F-ONLY-NEXT:    shlq $27, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %rbx, %r13
+; AVX512F-ONLY-NEXT:    shlq $28, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %rbx, %rax
+; AVX512F-ONLY-NEXT:    shlq $29, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %rbx, %r13
+; AVX512F-ONLY-NEXT:    shlq $30, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    shlq $31, %rbx
+; AVX512F-ONLY-NEXT:    orq %r13, %rbx
+; AVX512F-ONLY-NEXT:    kmovw %k0, %r13d
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $32, %rax
+; AVX512F-ONLY-NEXT:    orq %rbx, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %rbx
+; AVX512F-ONLY-NEXT:    shlq $33, %rbx
+; AVX512F-ONLY-NEXT:    orq %rax, %rbx
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $34, %rax
+; AVX512F-ONLY-NEXT:    orq %rbx, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %rbx
+; AVX512F-ONLY-NEXT:    shlq $35, %rbx
+; AVX512F-ONLY-NEXT:    orq %rax, %rbx
+; AVX512F-ONLY-NEXT:    movq %r13, %rbp
+; AVX512F-ONLY-NEXT:    shlq $36, %rbp
+; AVX512F-ONLY-NEXT:    orq %rbx, %rbp
+; AVX512F-ONLY-NEXT:    kmovw %k1, %ebx
+; AVX512F-ONLY-NEXT:    kshiftrw $11, %k0, %k1
+; AVX512F-ONLY-NEXT:    shlq $37, %r13
+; AVX512F-ONLY-NEXT:    orq %rbp, %r13
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $38, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %r13
+; AVX512F-ONLY-NEXT:    shlq $39, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $40, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %rbp
+; AVX512F-ONLY-NEXT:    shlq $41, %rbp
+; AVX512F-ONLY-NEXT:    orq %rax, %rbp
+; AVX512F-ONLY-NEXT:    movq %r15, %r13
+; AVX512F-ONLY-NEXT:    shlq $42, %r13
+; AVX512F-ONLY-NEXT:    orq %rbp, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k1, %ebp
+; AVX512F-ONLY-NEXT:    kshiftrw $12, %k0, %k1
+; AVX512F-ONLY-NEXT:    shlq $43, %r15
+; AVX512F-ONLY-NEXT:    orq %r13, %r15
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $44, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r15
+; AVX512F-ONLY-NEXT:    shlq $45, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $46, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r15
+; AVX512F-ONLY-NEXT:    shlq $47, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $48, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k1, %r15d
+; AVX512F-ONLY-NEXT:    kshiftrw $13, %k0, %k1
+; AVX512F-ONLY-NEXT:    shlq $49, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movzbl %r11b, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %r11
+; AVX512F-ONLY-NEXT:    shlq $50, %r11
+; AVX512F-ONLY-NEXT:    orq %r12, %r11
+; AVX512F-ONLY-NEXT:    movq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $51, %r12
+; AVX512F-ONLY-NEXT:    orq %r11, %r12
+; AVX512F-ONLY-NEXT:    movq %rax, %r11
+; AVX512F-ONLY-NEXT:    shlq $52, %r11
+; AVX512F-ONLY-NEXT:    orq %r12, %r11
+; AVX512F-ONLY-NEXT:    movq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $53, %r12
+; AVX512F-ONLY-NEXT:    orq %r11, %r12
+; AVX512F-ONLY-NEXT:    movq %rax, %r13
+; AVX512F-ONLY-NEXT:    shlq $54, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k1, %r11d
+; AVX512F-ONLY-NEXT:    kshiftrw $14, %k0, %k0
+; AVX512F-ONLY-NEXT:    shlq $55, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    andl $1, %ebx
+; AVX512F-ONLY-NEXT:    movq %rbx, %r12
+; AVX512F-ONLY-NEXT:    shlq $56, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %rbx, %rax
+; AVX512F-ONLY-NEXT:    shlq $57, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %rbx, %r12
+; AVX512F-ONLY-NEXT:    shlq $58, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %rbx, %rax
+; AVX512F-ONLY-NEXT:    shlq $59, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %rbx, %r13
+; AVX512F-ONLY-NEXT:    shlq $60, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k0, %r12d
+; AVX512F-ONLY-NEXT:    shlq $61, %rbx
+; AVX512F-ONLY-NEXT:    orq %r13, %rbx
+; AVX512F-ONLY-NEXT:    movzbl %r10b, %eax
+; AVX512F-ONLY-NEXT:    # kill: def $r10d killed $r10d def $r10
+; AVX512F-ONLY-NEXT:    andl $1, %r10d
+; AVX512F-ONLY-NEXT:    shlq $62, %r10
+; AVX512F-ONLY-NEXT:    orq %rbx, %r10
+; AVX512F-ONLY-NEXT:    movq %rax, %rbx
+; AVX512F-ONLY-NEXT:    shlq $63, %rbx
+; AVX512F-ONLY-NEXT:    orq %r10, %rbx
+; AVX512F-ONLY-NEXT:    orq %r14, %rbx
+; AVX512F-ONLY-NEXT:    movq %rbx, 8(%rsi)
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    leaq (%rax,%rax,2), %r10
+; AVX512F-ONLY-NEXT:    leaq (%r10,%rax,4), %r10
+; AVX512F-ONLY-NEXT:    leaq (%r10,%rax,8), %rax
+; AVX512F-ONLY-NEXT:    movzbl %r9b, %r10d
+; AVX512F-ONLY-NEXT:    andl $1, %r10d
+; AVX512F-ONLY-NEXT:    movq %r10, %rbx
+; AVX512F-ONLY-NEXT:    shlq $4, %rbx
+; AVX512F-ONLY-NEXT:    orq %rax, %rbx
+; AVX512F-ONLY-NEXT:    movq %r10, %r9
+; AVX512F-ONLY-NEXT:    shlq $5, %r9
+; AVX512F-ONLY-NEXT:    orq %rbx, %r9
+; AVX512F-ONLY-NEXT:    movq %r10, %rax
+; AVX512F-ONLY-NEXT:    shlq $6, %rax
+; AVX512F-ONLY-NEXT:    movq %r10, %rbx
+; AVX512F-ONLY-NEXT:    shlq $7, %rbx
+; AVX512F-ONLY-NEXT:    orq %rax, %rbx
+; AVX512F-ONLY-NEXT:    movq %r10, %rax
+; AVX512F-ONLY-NEXT:    shlq $8, %rax
+; AVX512F-ONLY-NEXT:    orq %rbx, %rax
+; AVX512F-ONLY-NEXT:    shlq $9, %r10
+; AVX512F-ONLY-NEXT:    orq %rax, %r10
+; AVX512F-ONLY-NEXT:    movzbl %r8b, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %r8
+; AVX512F-ONLY-NEXT:    shlq $10, %r8
+; AVX512F-ONLY-NEXT:    orq %r10, %r8
+; AVX512F-ONLY-NEXT:    movq %rax, %r10
+; AVX512F-ONLY-NEXT:    shlq $11, %r10
+; AVX512F-ONLY-NEXT:    orq %r8, %r10
+; AVX512F-ONLY-NEXT:    movq %rax, %r8
+; AVX512F-ONLY-NEXT:    shlq $12, %r8
+; AVX512F-ONLY-NEXT:    orq %r10, %r8
+; AVX512F-ONLY-NEXT:    movq %rax, %r10
+; AVX512F-ONLY-NEXT:    shlq $13, %r10
+; AVX512F-ONLY-NEXT:    orq %r8, %r10
+; AVX512F-ONLY-NEXT:    movq %rax, %r8
+; AVX512F-ONLY-NEXT:    shlq $14, %r8
+; AVX512F-ONLY-NEXT:    orq %r10, %r8
+; AVX512F-ONLY-NEXT:    shlq $15, %rax
+; AVX512F-ONLY-NEXT:    orq %r8, %rax
+; AVX512F-ONLY-NEXT:    movzbl %dil, %edi
+; AVX512F-ONLY-NEXT:    andl $1, %edi
+; AVX512F-ONLY-NEXT:    movq %rdi, %r8
+; AVX512F-ONLY-NEXT:    shlq $16, %r8
+; AVX512F-ONLY-NEXT:    orq %rax, %r8
+; AVX512F-ONLY-NEXT:    movq %rdi, %rax
+; AVX512F-ONLY-NEXT:    shlq $17, %rax
+; AVX512F-ONLY-NEXT:    orq %r8, %rax
+; AVX512F-ONLY-NEXT:    movq %rdi, %r8
+; AVX512F-ONLY-NEXT:    shlq $18, %r8
+; AVX512F-ONLY-NEXT:    orq %rax, %r8
+; AVX512F-ONLY-NEXT:    movq %rdi, %rax
+; AVX512F-ONLY-NEXT:    shlq $19, %rax
+; AVX512F-ONLY-NEXT:    orq %r8, %rax
+; AVX512F-ONLY-NEXT:    movq %rdi, %r8
+; AVX512F-ONLY-NEXT:    shlq $20, %r8
+; AVX512F-ONLY-NEXT:    orq %rax, %r8
+; AVX512F-ONLY-NEXT:    shlq $21, %rdi
+; AVX512F-ONLY-NEXT:    orq %r8, %rdi
+; AVX512F-ONLY-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %r8
+; AVX512F-ONLY-NEXT:    shlq $22, %r8
+; AVX512F-ONLY-NEXT:    orq %rdi, %r8
+; AVX512F-ONLY-NEXT:    movq %rax, %rdi
+; AVX512F-ONLY-NEXT:    shlq $23, %rdi
+; AVX512F-ONLY-NEXT:    orq %r8, %rdi
+; AVX512F-ONLY-NEXT:    movq %rax, %r8
+; AVX512F-ONLY-NEXT:    shlq $24, %r8
+; AVX512F-ONLY-NEXT:    orq %rdi, %r8
+; AVX512F-ONLY-NEXT:    movq %rax, %rdi
+; AVX512F-ONLY-NEXT:    shlq $25, %rdi
+; AVX512F-ONLY-NEXT:    orq %r8, %rdi
+; AVX512F-ONLY-NEXT:    movq %rax, %r8
+; AVX512F-ONLY-NEXT:    shlq $26, %r8
+; AVX512F-ONLY-NEXT:    orq %rdi, %r8
+; AVX512F-ONLY-NEXT:    shlq $27, %rax
+; AVX512F-ONLY-NEXT:    orq %r8, %rax
+; AVX512F-ONLY-NEXT:    movzbl %cl, %edi
+; AVX512F-ONLY-NEXT:    andl $1, %edi
+; AVX512F-ONLY-NEXT:    movq %rdi, %r8
+; AVX512F-ONLY-NEXT:    shlq $28, %r8
+; AVX512F-ONLY-NEXT:    orq %rax, %r8
+; AVX512F-ONLY-NEXT:    movq %rdi, %rax
+; AVX512F-ONLY-NEXT:    shlq $29, %rax
+; AVX512F-ONLY-NEXT:    orq %r8, %rax
+; AVX512F-ONLY-NEXT:    movq %rdi, %r8
+; AVX512F-ONLY-NEXT:    shlq $30, %r8
+; AVX512F-ONLY-NEXT:    orq %rax, %r8
+; AVX512F-ONLY-NEXT:    shlq $31, %rdi
+; AVX512F-ONLY-NEXT:    orq %r8, %rdi
+; AVX512F-ONLY-NEXT:    andl $1, %ecx
+; AVX512F-ONLY-NEXT:    movq %rcx, %rax
+; AVX512F-ONLY-NEXT:    shlq $32, %rax
+; AVX512F-ONLY-NEXT:    orq %rdi, %rax
+; AVX512F-ONLY-NEXT:    shlq $33, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    andl $1, %ebp
+; AVX512F-ONLY-NEXT:    movq %rbp, %rax
+; AVX512F-ONLY-NEXT:    shlq $34, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    movq %rbp, %rcx
+; AVX512F-ONLY-NEXT:    shlq $35, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    movq %rbp, %rax
+; AVX512F-ONLY-NEXT:    shlq $36, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    movq %rbp, %rcx
+; AVX512F-ONLY-NEXT:    shlq $37, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    movq %rbp, %rax
+; AVX512F-ONLY-NEXT:    shlq $38, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    shlq $39, %rbp
+; AVX512F-ONLY-NEXT:    orq %rax, %rbp
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $40, %rax
+; AVX512F-ONLY-NEXT:    orq %rbp, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %rcx
+; AVX512F-ONLY-NEXT:    shlq $41, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $42, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %rcx
+; AVX512F-ONLY-NEXT:    shlq $43, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $44, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    shlq $45, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    andl $1, %r11d
+; AVX512F-ONLY-NEXT:    movq %r11, %rax
+; AVX512F-ONLY-NEXT:    shlq $46, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movq %r11, %rcx
+; AVX512F-ONLY-NEXT:    shlq $47, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    movq %r11, %rax
+; AVX512F-ONLY-NEXT:    shlq $48, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    movq %r11, %rcx
+; AVX512F-ONLY-NEXT:    shlq $49, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    movq %r11, %rax
+; AVX512F-ONLY-NEXT:    shlq $50, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    shlq $51, %r11
+; AVX512F-ONLY-NEXT:    orq %rax, %r11
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $52, %rax
+; AVX512F-ONLY-NEXT:    orq %r11, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %rcx
+; AVX512F-ONLY-NEXT:    shlq $53, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $54, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %rcx
+; AVX512F-ONLY-NEXT:    shlq $55, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $56, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    shlq $57, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movl %edx, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %rcx
+; AVX512F-ONLY-NEXT:    shlq $58, %rcx
+; AVX512F-ONLY-NEXT:    orq %r12, %rcx
+; AVX512F-ONLY-NEXT:    movq %rax, %rdi
+; AVX512F-ONLY-NEXT:    shlq $59, %rdi
+; AVX512F-ONLY-NEXT:    orq %rcx, %rdi
+; AVX512F-ONLY-NEXT:    movq %rax, %rcx
+; AVX512F-ONLY-NEXT:    shlq $60, %rcx
+; AVX512F-ONLY-NEXT:    orq %rdi, %rcx
+; AVX512F-ONLY-NEXT:    movq %rax, %rdi
+; AVX512F-ONLY-NEXT:    shlq $61, %rdi
+; AVX512F-ONLY-NEXT:    orq %rcx, %rdi
+; AVX512F-ONLY-NEXT:    shlq $62, %rax
+; AVX512F-ONLY-NEXT:    orq %rdi, %rax
+; AVX512F-ONLY-NEXT:    movzbl %dl, %ecx
+; AVX512F-ONLY-NEXT:    shlq $63, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    orq %r9, %rcx
+; AVX512F-ONLY-NEXT:    movq %rcx, 16(%rsi)
+; AVX512F-ONLY-NEXT:    popq %rbx
+; AVX512F-ONLY-NEXT:    popq %r12
+; AVX512F-ONLY-NEXT:    popq %r13
+; AVX512F-ONLY-NEXT:    popq %r14
+; AVX512F-ONLY-NEXT:    popq %r15
+; AVX512F-ONLY-NEXT:    popq %rbp
+; AVX512F-ONLY-NEXT:    retq
+;
+; AVX512DQ-LABEL: mask_replication_factor6_vf32:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    pushq %rbp
+; AVX512DQ-NEXT:    pushq %r15
+; AVX512DQ-NEXT:    pushq %r14
+; AVX512DQ-NEXT:    pushq %r13
+; AVX512DQ-NEXT:    pushq %r12
+; AVX512DQ-NEXT:    pushq %rbx
+; AVX512DQ-NEXT:    kmovw (%rdi), %k1
+; AVX512DQ-NEXT:    kmovw 2(%rdi), %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k2
+; AVX512DQ-NEXT:    kmovw %k2, %edx
+; AVX512DQ-NEXT:    kshiftrw $10, %k0, %k2
+; AVX512DQ-NEXT:    kmovw %k2, %ecx
+; AVX512DQ-NEXT:    kshiftrw $9, %k0, %k2
+; AVX512DQ-NEXT:    kmovw %k2, %eax
+; AVX512DQ-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512DQ-NEXT:    kshiftrw $8, %k0, %k2
+; AVX512DQ-NEXT:    kmovw %k2, %edi
+; AVX512DQ-NEXT:    kshiftrw $7, %k0, %k2
+; AVX512DQ-NEXT:    kmovw %k2, %r8d
+; AVX512DQ-NEXT:    kshiftrw $6, %k0, %k2
+; AVX512DQ-NEXT:    kmovw %k2, %r9d
+; AVX512DQ-NEXT:    kshiftrw $5, %k0, %k2
+; AVX512DQ-NEXT:    kmovw %k2, %r10d
+; AVX512DQ-NEXT:    kshiftrw $3, %k0, %k2
+; AVX512DQ-NEXT:    kmovw %k2, %r11d
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k2
+; AVX512DQ-NEXT:    kmovw %k2, %ebx
+; AVX512DQ-NEXT:    kshiftrw $5, %k1, %k2
+; AVX512DQ-NEXT:    kmovw %k2, %r15d
+; AVX512DQ-NEXT:    kshiftrw $1, %k1, %k2
+; AVX512DQ-NEXT:    kmovw %k2, %ebp
+; AVX512DQ-NEXT:    kshiftrw $2, %k1, %k2
+; AVX512DQ-NEXT:    kmovw %k1, %r14d
+; AVX512DQ-NEXT:    movzbl %r14b, %r14d
+; AVX512DQ-NEXT:    andl $1, %r14d
+; AVX512DQ-NEXT:    leaq (%r14,%r14,2), %r12
+; AVX512DQ-NEXT:    leaq (%r12,%r14,4), %r12
+; AVX512DQ-NEXT:    leaq (%r12,%r14,8), %r12
+; AVX512DQ-NEXT:    movq %r14, %r13
+; AVX512DQ-NEXT:    shlq $4, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    shlq $5, %r14
+; AVX512DQ-NEXT:    orq %r13, %r14
+; AVX512DQ-NEXT:    movzbl %bpl, %r13d
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $6, %r12
+; AVX512DQ-NEXT:    movq %r13, %rbp
+; AVX512DQ-NEXT:    shlq $7, %rbp
+; AVX512DQ-NEXT:    orq %r12, %rbp
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $8, %r12
+; AVX512DQ-NEXT:    orq %rbp, %r12
+; AVX512DQ-NEXT:    movq %r13, %rbp
+; AVX512DQ-NEXT:    shlq $9, %rbp
+; AVX512DQ-NEXT:    orq %r12, %rbp
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $10, %r12
+; AVX512DQ-NEXT:    orq %rbp, %r12
+; AVX512DQ-NEXT:    kmovw %k2, %ebp
+; AVX512DQ-NEXT:    kshiftrw $3, %k1, %k2
+; AVX512DQ-NEXT:    shlq $11, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    movzbl %bpl, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rbp
+; AVX512DQ-NEXT:    shlq $12, %rbp
+; AVX512DQ-NEXT:    orq %r13, %rbp
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $13, %r13
+; AVX512DQ-NEXT:    orq %rbp, %r13
+; AVX512DQ-NEXT:    movq %r12, %rbp
+; AVX512DQ-NEXT:    shlq $14, %rbp
+; AVX512DQ-NEXT:    orq %r13, %rbp
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $15, %r13
+; AVX512DQ-NEXT:    orq %rbp, %r13
+; AVX512DQ-NEXT:    movq %r12, %rbp
+; AVX512DQ-NEXT:    shlq $16, %rbp
+; AVX512DQ-NEXT:    orq %r13, %rbp
+; AVX512DQ-NEXT:    kmovw %k2, %r13d
+; AVX512DQ-NEXT:    kshiftrw $4, %k1, %k2
+; AVX512DQ-NEXT:    shlq $17, %r12
+; AVX512DQ-NEXT:    orq %rbp, %r12
+; AVX512DQ-NEXT:    movzbl %r13b, %r13d
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %rbp
+; AVX512DQ-NEXT:    shlq $18, %rbp
+; AVX512DQ-NEXT:    orq %r12, %rbp
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $19, %r12
+; AVX512DQ-NEXT:    orq %rbp, %r12
+; AVX512DQ-NEXT:    movq %r13, %rbp
+; AVX512DQ-NEXT:    shlq $20, %rbp
+; AVX512DQ-NEXT:    orq %r12, %rbp
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $21, %r12
+; AVX512DQ-NEXT:    orq %rbp, %r12
+; AVX512DQ-NEXT:    movq %r13, %rbp
+; AVX512DQ-NEXT:    shlq $22, %rbp
+; AVX512DQ-NEXT:    orq %r12, %rbp
+; AVX512DQ-NEXT:    kmovw %k2, %r12d
+; AVX512DQ-NEXT:    kshiftrw $10, %k1, %k2
+; AVX512DQ-NEXT:    shlq $23, %r13
+; AVX512DQ-NEXT:    orq %rbp, %r13
+; AVX512DQ-NEXT:    movzbl %r12b, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rbp
+; AVX512DQ-NEXT:    shlq $24, %rbp
+; AVX512DQ-NEXT:    orq %r13, %rbp
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $25, %r13
+; AVX512DQ-NEXT:    orq %rbp, %r13
+; AVX512DQ-NEXT:    movq %r12, %rbp
+; AVX512DQ-NEXT:    shlq $26, %rbp
+; AVX512DQ-NEXT:    orq %r13, %rbp
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $27, %r13
+; AVX512DQ-NEXT:    orq %rbp, %r13
+; AVX512DQ-NEXT:    movq %r12, %rbp
+; AVX512DQ-NEXT:    shlq $28, %rbp
+; AVX512DQ-NEXT:    orq %r13, %rbp
+; AVX512DQ-NEXT:    shlq $29, %r12
+; AVX512DQ-NEXT:    orq %rbp, %r12
+; AVX512DQ-NEXT:    movzbl %r15b, %r13d
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %rbp
+; AVX512DQ-NEXT:    shlq $30, %rbp
+; AVX512DQ-NEXT:    orq %r12, %rbp
+; AVX512DQ-NEXT:    kmovw %k2, %r12d
+; AVX512DQ-NEXT:    kshiftrw $6, %k1, %k2
+; AVX512DQ-NEXT:    shlq $31, %r13
+; AVX512DQ-NEXT:    orq %rbp, %r13
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    movq %r15, %rbp
+; AVX512DQ-NEXT:    shlq $32, %rbp
+; AVX512DQ-NEXT:    orq %r13, %rbp
+; AVX512DQ-NEXT:    movq %r15, %r13
+; AVX512DQ-NEXT:    shlq $33, %r13
+; AVX512DQ-NEXT:    orq %rbp, %r13
+; AVX512DQ-NEXT:    movq %r15, %rbp
+; AVX512DQ-NEXT:    shlq $34, %rbp
+; AVX512DQ-NEXT:    orq %r13, %rbp
+; AVX512DQ-NEXT:    kmovw %k2, %r13d
+; AVX512DQ-NEXT:    kshiftrw $7, %k1, %k2
+; AVX512DQ-NEXT:    shlq $35, %r15
+; AVX512DQ-NEXT:    orq %rbp, %r15
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %rbp
+; AVX512DQ-NEXT:    shlq $36, %rbp
+; AVX512DQ-NEXT:    orq %r15, %rbp
+; AVX512DQ-NEXT:    movq %r13, %r15
+; AVX512DQ-NEXT:    shlq $37, %r15
+; AVX512DQ-NEXT:    orq %rbp, %r15
+; AVX512DQ-NEXT:    movq %r13, %rbp
+; AVX512DQ-NEXT:    shlq $38, %rbp
+; AVX512DQ-NEXT:    orq %r15, %rbp
+; AVX512DQ-NEXT:    movq %r13, %r15
+; AVX512DQ-NEXT:    shlq $39, %r15
+; AVX512DQ-NEXT:    orq %rbp, %r15
+; AVX512DQ-NEXT:    movq %r13, %rbp
+; AVX512DQ-NEXT:    shlq $40, %rbp
+; AVX512DQ-NEXT:    orq %r15, %rbp
+; AVX512DQ-NEXT:    kmovw %k2, %r15d
+; AVX512DQ-NEXT:    kshiftrw $8, %k1, %k2
+; AVX512DQ-NEXT:    shlq $41, %r13
+; AVX512DQ-NEXT:    orq %rbp, %r13
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    movq %r15, %rbp
+; AVX512DQ-NEXT:    shlq $42, %rbp
+; AVX512DQ-NEXT:    orq %r13, %rbp
+; AVX512DQ-NEXT:    movq %r15, %r13
+; AVX512DQ-NEXT:    shlq $43, %r13
+; AVX512DQ-NEXT:    orq %rbp, %r13
+; AVX512DQ-NEXT:    movq %r15, %rbp
+; AVX512DQ-NEXT:    shlq $44, %rbp
+; AVX512DQ-NEXT:    orq %r13, %rbp
+; AVX512DQ-NEXT:    movq %r15, %r13
+; AVX512DQ-NEXT:    shlq $45, %r13
+; AVX512DQ-NEXT:    orq %rbp, %r13
+; AVX512DQ-NEXT:    movq %r15, %rbp
+; AVX512DQ-NEXT:    shlq $46, %rbp
+; AVX512DQ-NEXT:    orq %r13, %rbp
+; AVX512DQ-NEXT:    kmovw %k2, %r13d
+; AVX512DQ-NEXT:    kshiftrw $9, %k1, %k2
+; AVX512DQ-NEXT:    shlq $47, %r15
+; AVX512DQ-NEXT:    orq %rbp, %r15
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %rbp
+; AVX512DQ-NEXT:    shlq $48, %rbp
+; AVX512DQ-NEXT:    orq %r15, %rbp
+; AVX512DQ-NEXT:    movq %r13, %r15
+; AVX512DQ-NEXT:    shlq $49, %r15
+; AVX512DQ-NEXT:    orq %rbp, %r15
+; AVX512DQ-NEXT:    movq %r13, %rbp
+; AVX512DQ-NEXT:    shlq $50, %rbp
+; AVX512DQ-NEXT:    orq %r15, %rbp
+; AVX512DQ-NEXT:    movq %r13, %r15
+; AVX512DQ-NEXT:    shlq $51, %r15
+; AVX512DQ-NEXT:    orq %rbp, %r15
+; AVX512DQ-NEXT:    movq %r13, %rbp
+; AVX512DQ-NEXT:    shlq $52, %rbp
+; AVX512DQ-NEXT:    orq %r15, %rbp
+; AVX512DQ-NEXT:    kmovw %k2, %r15d
+; AVX512DQ-NEXT:    kshiftrw $14, %k1, %k2
+; AVX512DQ-NEXT:    shlq $53, %r13
+; AVX512DQ-NEXT:    orq %rbp, %r13
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    movq %r15, %rbp
+; AVX512DQ-NEXT:    shlq $54, %rbp
+; AVX512DQ-NEXT:    orq %r13, %rbp
+; AVX512DQ-NEXT:    movq %r15, %r13
+; AVX512DQ-NEXT:    shlq $55, %r13
+; AVX512DQ-NEXT:    orq %rbp, %r13
+; AVX512DQ-NEXT:    movq %r15, %rbp
+; AVX512DQ-NEXT:    shlq $56, %rbp
+; AVX512DQ-NEXT:    orq %r13, %rbp
+; AVX512DQ-NEXT:    movq %r15, %r13
+; AVX512DQ-NEXT:    shlq $57, %r13
+; AVX512DQ-NEXT:    orq %rbp, %r13
+; AVX512DQ-NEXT:    movq %r15, %rbp
+; AVX512DQ-NEXT:    shlq $58, %rbp
+; AVX512DQ-NEXT:    orq %r13, %rbp
+; AVX512DQ-NEXT:    shlq $59, %r15
+; AVX512DQ-NEXT:    orq %rbp, %r15
+; AVX512DQ-NEXT:    movl %r12d, %r13d
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %rbp
+; AVX512DQ-NEXT:    shlq $60, %rbp
+; AVX512DQ-NEXT:    orq %r15, %rbp
+; AVX512DQ-NEXT:    movq %r13, %r15
+; AVX512DQ-NEXT:    shlq $61, %r15
+; AVX512DQ-NEXT:    orq %rbp, %r15
+; AVX512DQ-NEXT:    kmovw %k2, %ebp
+; AVX512DQ-NEXT:    kshiftrw $12, %k1, %k2
+; AVX512DQ-NEXT:    movzbl %r12b, %eax
+; AVX512DQ-NEXT:    shlq $62, %r13
+; AVX512DQ-NEXT:    orq %r15, %r13
+; AVX512DQ-NEXT:    movq %rax, %r15
+; AVX512DQ-NEXT:    shlq $63, %r15
+; AVX512DQ-NEXT:    orq %r13, %r15
+; AVX512DQ-NEXT:    kmovw %k2, %r13d
+; AVX512DQ-NEXT:    kshiftrw $11, %k1, %k2
+; AVX512DQ-NEXT:    orq %r14, %r15
+; AVX512DQ-NEXT:    kmovw %k2, %r14d
+; AVX512DQ-NEXT:    kshiftrw $13, %k1, %k1
+; AVX512DQ-NEXT:    movq %r15, (%rsi)
+; AVX512DQ-NEXT:    movzbl %r14b, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    leaq (%rax,%rax,2), %rax
+; AVX512DQ-NEXT:    leaq (%rax,%r12,4), %rax
+; AVX512DQ-NEXT:    leaq (%rax,%r12,8), %rax
+; AVX512DQ-NEXT:    movq %r12, %r15
+; AVX512DQ-NEXT:    shlq $4, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    movq %r12, %r14
+; AVX512DQ-NEXT:    shlq $5, %r14
+; AVX512DQ-NEXT:    orq %r15, %r14
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $6, %rax
+; AVX512DQ-NEXT:    shlq $7, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movzbl %r13b, %r15d
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $8, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $9, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $10, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $11, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $12, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    kmovw %k1, %r12d
+; AVX512DQ-NEXT:    kshiftrw $1, %k0, %k1
+; AVX512DQ-NEXT:    shlq $13, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    movzbl %r12b, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $14, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movq %r12, %r15
+; AVX512DQ-NEXT:    shlq $15, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $16, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movq %r12, %r15
+; AVX512DQ-NEXT:    shlq $17, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $18, %r13
+; AVX512DQ-NEXT:    orq %r15, %r13
+; AVX512DQ-NEXT:    kmovw %k1, %r15d
+; AVX512DQ-NEXT:    kshiftrw $2, %k0, %k1
+; AVX512DQ-NEXT:    shlq $19, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    movzbl %bpl, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %r13
+; AVX512DQ-NEXT:    shlq $20, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    movq %rax, %r12
+; AVX512DQ-NEXT:    shlq $21, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    movq %rax, %r13
+; AVX512DQ-NEXT:    shlq $22, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    movq %rax, %r12
+; AVX512DQ-NEXT:    shlq $23, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    movq %rax, %r13
+; AVX512DQ-NEXT:    shlq $24, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    kmovw %k1, %r12d
+; AVX512DQ-NEXT:    kshiftrw $4, %k0, %k1
+; AVX512DQ-NEXT:    shlq $25, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movzbl %bl, %ebx
+; AVX512DQ-NEXT:    andl $1, %ebx
+; AVX512DQ-NEXT:    movq %rbx, %r13
+; AVX512DQ-NEXT:    shlq $26, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %rbx, %rax
+; AVX512DQ-NEXT:    shlq $27, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %rbx, %r13
+; AVX512DQ-NEXT:    shlq $28, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %rbx, %rax
+; AVX512DQ-NEXT:    shlq $29, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %rbx, %r13
+; AVX512DQ-NEXT:    shlq $30, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    shlq $31, %rbx
+; AVX512DQ-NEXT:    orq %r13, %rbx
+; AVX512DQ-NEXT:    kmovw %k0, %r13d
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $32, %rax
+; AVX512DQ-NEXT:    orq %rbx, %rax
+; AVX512DQ-NEXT:    movq %r13, %rbx
+; AVX512DQ-NEXT:    shlq $33, %rbx
+; AVX512DQ-NEXT:    orq %rax, %rbx
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $34, %rax
+; AVX512DQ-NEXT:    orq %rbx, %rax
+; AVX512DQ-NEXT:    movq %r13, %rbx
+; AVX512DQ-NEXT:    shlq $35, %rbx
+; AVX512DQ-NEXT:    orq %rax, %rbx
+; AVX512DQ-NEXT:    movq %r13, %rbp
+; AVX512DQ-NEXT:    shlq $36, %rbp
+; AVX512DQ-NEXT:    orq %rbx, %rbp
+; AVX512DQ-NEXT:    kmovw %k1, %ebx
+; AVX512DQ-NEXT:    kshiftrw $11, %k0, %k1
+; AVX512DQ-NEXT:    shlq $37, %r13
+; AVX512DQ-NEXT:    orq %rbp, %r13
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $38, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r15, %r13
+; AVX512DQ-NEXT:    shlq $39, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $40, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r15, %rbp
+; AVX512DQ-NEXT:    shlq $41, %rbp
+; AVX512DQ-NEXT:    orq %rax, %rbp
+; AVX512DQ-NEXT:    movq %r15, %r13
+; AVX512DQ-NEXT:    shlq $42, %r13
+; AVX512DQ-NEXT:    orq %rbp, %r13
+; AVX512DQ-NEXT:    kmovw %k1, %ebp
+; AVX512DQ-NEXT:    kshiftrw $12, %k0, %k1
+; AVX512DQ-NEXT:    shlq $43, %r15
+; AVX512DQ-NEXT:    orq %r13, %r15
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $44, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movq %r12, %r15
+; AVX512DQ-NEXT:    shlq $45, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $46, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movq %r12, %r15
+; AVX512DQ-NEXT:    shlq $47, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $48, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    kmovw %k1, %r15d
+; AVX512DQ-NEXT:    kshiftrw $13, %k0, %k1
+; AVX512DQ-NEXT:    shlq $49, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movzbl %r11b, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %r11
+; AVX512DQ-NEXT:    shlq $50, %r11
+; AVX512DQ-NEXT:    orq %r12, %r11
+; AVX512DQ-NEXT:    movq %rax, %r12
+; AVX512DQ-NEXT:    shlq $51, %r12
+; AVX512DQ-NEXT:    orq %r11, %r12
+; AVX512DQ-NEXT:    movq %rax, %r11
+; AVX512DQ-NEXT:    shlq $52, %r11
+; AVX512DQ-NEXT:    orq %r12, %r11
+; AVX512DQ-NEXT:    movq %rax, %r12
+; AVX512DQ-NEXT:    shlq $53, %r12
+; AVX512DQ-NEXT:    orq %r11, %r12
+; AVX512DQ-NEXT:    movq %rax, %r13
+; AVX512DQ-NEXT:    shlq $54, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    kmovw %k1, %r11d
+; AVX512DQ-NEXT:    kshiftrw $14, %k0, %k0
+; AVX512DQ-NEXT:    shlq $55, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    andl $1, %ebx
+; AVX512DQ-NEXT:    movq %rbx, %r12
+; AVX512DQ-NEXT:    shlq $56, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %rbx, %rax
+; AVX512DQ-NEXT:    shlq $57, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %rbx, %r12
+; AVX512DQ-NEXT:    shlq $58, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %rbx, %rax
+; AVX512DQ-NEXT:    shlq $59, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %rbx, %r13
+; AVX512DQ-NEXT:    shlq $60, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    kmovw %k0, %r12d
+; AVX512DQ-NEXT:    shlq $61, %rbx
+; AVX512DQ-NEXT:    orq %r13, %rbx
+; AVX512DQ-NEXT:    movzbl %r10b, %eax
+; AVX512DQ-NEXT:    # kill: def $r10d killed $r10d def $r10
+; AVX512DQ-NEXT:    andl $1, %r10d
+; AVX512DQ-NEXT:    shlq $62, %r10
+; AVX512DQ-NEXT:    orq %rbx, %r10
+; AVX512DQ-NEXT:    movq %rax, %rbx
+; AVX512DQ-NEXT:    shlq $63, %rbx
+; AVX512DQ-NEXT:    orq %r10, %rbx
+; AVX512DQ-NEXT:    orq %r14, %rbx
+; AVX512DQ-NEXT:    movq %rbx, 8(%rsi)
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    leaq (%rax,%rax,2), %r10
+; AVX512DQ-NEXT:    leaq (%r10,%rax,4), %r10
+; AVX512DQ-NEXT:    leaq (%r10,%rax,8), %rax
+; AVX512DQ-NEXT:    movzbl %r9b, %r10d
+; AVX512DQ-NEXT:    andl $1, %r10d
+; AVX512DQ-NEXT:    movq %r10, %rbx
+; AVX512DQ-NEXT:    shlq $4, %rbx
+; AVX512DQ-NEXT:    orq %rax, %rbx
+; AVX512DQ-NEXT:    movq %r10, %r9
+; AVX512DQ-NEXT:    shlq $5, %r9
+; AVX512DQ-NEXT:    orq %rbx, %r9
+; AVX512DQ-NEXT:    movq %r10, %rax
+; AVX512DQ-NEXT:    shlq $6, %rax
+; AVX512DQ-NEXT:    movq %r10, %rbx
+; AVX512DQ-NEXT:    shlq $7, %rbx
+; AVX512DQ-NEXT:    orq %rax, %rbx
+; AVX512DQ-NEXT:    movq %r10, %rax
+; AVX512DQ-NEXT:    shlq $8, %rax
+; AVX512DQ-NEXT:    orq %rbx, %rax
+; AVX512DQ-NEXT:    shlq $9, %r10
+; AVX512DQ-NEXT:    orq %rax, %r10
+; AVX512DQ-NEXT:    movzbl %r8b, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %r8
+; AVX512DQ-NEXT:    shlq $10, %r8
+; AVX512DQ-NEXT:    orq %r10, %r8
+; AVX512DQ-NEXT:    movq %rax, %r10
+; AVX512DQ-NEXT:    shlq $11, %r10
+; AVX512DQ-NEXT:    orq %r8, %r10
+; AVX512DQ-NEXT:    movq %rax, %r8
+; AVX512DQ-NEXT:    shlq $12, %r8
+; AVX512DQ-NEXT:    orq %r10, %r8
+; AVX512DQ-NEXT:    movq %rax, %r10
+; AVX512DQ-NEXT:    shlq $13, %r10
+; AVX512DQ-NEXT:    orq %r8, %r10
+; AVX512DQ-NEXT:    movq %rax, %r8
+; AVX512DQ-NEXT:    shlq $14, %r8
+; AVX512DQ-NEXT:    orq %r10, %r8
+; AVX512DQ-NEXT:    shlq $15, %rax
+; AVX512DQ-NEXT:    orq %r8, %rax
+; AVX512DQ-NEXT:    movzbl %dil, %edi
+; AVX512DQ-NEXT:    andl $1, %edi
+; AVX512DQ-NEXT:    movq %rdi, %r8
+; AVX512DQ-NEXT:    shlq $16, %r8
+; AVX512DQ-NEXT:    orq %rax, %r8
+; AVX512DQ-NEXT:    movq %rdi, %rax
+; AVX512DQ-NEXT:    shlq $17, %rax
+; AVX512DQ-NEXT:    orq %r8, %rax
+; AVX512DQ-NEXT:    movq %rdi, %r8
+; AVX512DQ-NEXT:    shlq $18, %r8
+; AVX512DQ-NEXT:    orq %rax, %r8
+; AVX512DQ-NEXT:    movq %rdi, %rax
+; AVX512DQ-NEXT:    shlq $19, %rax
+; AVX512DQ-NEXT:    orq %r8, %rax
+; AVX512DQ-NEXT:    movq %rdi, %r8
+; AVX512DQ-NEXT:    shlq $20, %r8
+; AVX512DQ-NEXT:    orq %rax, %r8
+; AVX512DQ-NEXT:    shlq $21, %rdi
+; AVX512DQ-NEXT:    orq %r8, %rdi
+; AVX512DQ-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %r8
+; AVX512DQ-NEXT:    shlq $22, %r8
+; AVX512DQ-NEXT:    orq %rdi, %r8
+; AVX512DQ-NEXT:    movq %rax, %rdi
+; AVX512DQ-NEXT:    shlq $23, %rdi
+; AVX512DQ-NEXT:    orq %r8, %rdi
+; AVX512DQ-NEXT:    movq %rax, %r8
+; AVX512DQ-NEXT:    shlq $24, %r8
+; AVX512DQ-NEXT:    orq %rdi, %r8
+; AVX512DQ-NEXT:    movq %rax, %rdi
+; AVX512DQ-NEXT:    shlq $25, %rdi
+; AVX512DQ-NEXT:    orq %r8, %rdi
+; AVX512DQ-NEXT:    movq %rax, %r8
+; AVX512DQ-NEXT:    shlq $26, %r8
+; AVX512DQ-NEXT:    orq %rdi, %r8
+; AVX512DQ-NEXT:    shlq $27, %rax
+; AVX512DQ-NEXT:    orq %r8, %rax
+; AVX512DQ-NEXT:    movzbl %cl, %edi
+; AVX512DQ-NEXT:    andl $1, %edi
+; AVX512DQ-NEXT:    movq %rdi, %r8
+; AVX512DQ-NEXT:    shlq $28, %r8
+; AVX512DQ-NEXT:    orq %rax, %r8
+; AVX512DQ-NEXT:    movq %rdi, %rax
+; AVX512DQ-NEXT:    shlq $29, %rax
+; AVX512DQ-NEXT:    orq %r8, %rax
+; AVX512DQ-NEXT:    movq %rdi, %r8
+; AVX512DQ-NEXT:    shlq $30, %r8
+; AVX512DQ-NEXT:    orq %rax, %r8
+; AVX512DQ-NEXT:    shlq $31, %rdi
+; AVX512DQ-NEXT:    orq %r8, %rdi
+; AVX512DQ-NEXT:    andl $1, %ecx
+; AVX512DQ-NEXT:    movq %rcx, %rax
+; AVX512DQ-NEXT:    shlq $32, %rax
+; AVX512DQ-NEXT:    orq %rdi, %rax
+; AVX512DQ-NEXT:    shlq $33, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    andl $1, %ebp
+; AVX512DQ-NEXT:    movq %rbp, %rax
+; AVX512DQ-NEXT:    shlq $34, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    movq %rbp, %rcx
+; AVX512DQ-NEXT:    shlq $35, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    movq %rbp, %rax
+; AVX512DQ-NEXT:    shlq $36, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    movq %rbp, %rcx
+; AVX512DQ-NEXT:    shlq $37, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    movq %rbp, %rax
+; AVX512DQ-NEXT:    shlq $38, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    shlq $39, %rbp
+; AVX512DQ-NEXT:    orq %rax, %rbp
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $40, %rax
+; AVX512DQ-NEXT:    orq %rbp, %rax
+; AVX512DQ-NEXT:    movq %r15, %rcx
+; AVX512DQ-NEXT:    shlq $41, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $42, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    movq %r15, %rcx
+; AVX512DQ-NEXT:    shlq $43, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $44, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    shlq $45, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    andl $1, %r11d
+; AVX512DQ-NEXT:    movq %r11, %rax
+; AVX512DQ-NEXT:    shlq $46, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movq %r11, %rcx
+; AVX512DQ-NEXT:    shlq $47, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    movq %r11, %rax
+; AVX512DQ-NEXT:    shlq $48, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    movq %r11, %rcx
+; AVX512DQ-NEXT:    shlq $49, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    movq %r11, %rax
+; AVX512DQ-NEXT:    shlq $50, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    shlq $51, %r11
+; AVX512DQ-NEXT:    orq %rax, %r11
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $52, %rax
+; AVX512DQ-NEXT:    orq %r11, %rax
+; AVX512DQ-NEXT:    movq %r12, %rcx
+; AVX512DQ-NEXT:    shlq $53, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $54, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    movq %r12, %rcx
+; AVX512DQ-NEXT:    shlq $55, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $56, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    shlq $57, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movl %edx, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %rcx
+; AVX512DQ-NEXT:    shlq $58, %rcx
+; AVX512DQ-NEXT:    orq %r12, %rcx
+; AVX512DQ-NEXT:    movq %rax, %rdi
+; AVX512DQ-NEXT:    shlq $59, %rdi
+; AVX512DQ-NEXT:    orq %rcx, %rdi
+; AVX512DQ-NEXT:    movq %rax, %rcx
+; AVX512DQ-NEXT:    shlq $60, %rcx
+; AVX512DQ-NEXT:    orq %rdi, %rcx
+; AVX512DQ-NEXT:    movq %rax, %rdi
+; AVX512DQ-NEXT:    shlq $61, %rdi
+; AVX512DQ-NEXT:    orq %rcx, %rdi
+; AVX512DQ-NEXT:    shlq $62, %rax
+; AVX512DQ-NEXT:    orq %rdi, %rax
+; AVX512DQ-NEXT:    movzbl %dl, %ecx
+; AVX512DQ-NEXT:    shlq $63, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    orq %r9, %rcx
+; AVX512DQ-NEXT:    movq %rcx, 16(%rsi)
+; AVX512DQ-NEXT:    popq %rbx
+; AVX512DQ-NEXT:    popq %r12
+; AVX512DQ-NEXT:    popq %r13
+; AVX512DQ-NEXT:    popq %r14
+; AVX512DQ-NEXT:    popq %r15
+; AVX512DQ-NEXT:    popq %rbp
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: mask_replication_factor6_vf32:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    pushq %rbp
+; AVX512BW-NEXT:    pushq %r15
+; AVX512BW-NEXT:    pushq %r14
+; AVX512BW-NEXT:    pushq %r13
+; AVX512BW-NEXT:    pushq %r12
+; AVX512BW-NEXT:    pushq %rbx
+; AVX512BW-NEXT:    kmovd (%rdi), %k0
+; AVX512BW-NEXT:    kshiftrd $31, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %edx
+; AVX512BW-NEXT:    kshiftrd $26, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %ecx
+; AVX512BW-NEXT:    kshiftrd $25, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512BW-NEXT:    kshiftrd $24, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %edi
+; AVX512BW-NEXT:    kshiftrd $23, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r8d
+; AVX512BW-NEXT:    kshiftrd $22, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r9d
+; AVX512BW-NEXT:    kshiftrd $21, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r10d
+; AVX512BW-NEXT:    kshiftrd $15, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r11d
+; AVX512BW-NEXT:    kshiftrd $14, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %ebp
+; AVX512BW-NEXT:    kshiftrd $5, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r14d
+; AVX512BW-NEXT:    kshiftrd $1, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r15d
+; AVX512BW-NEXT:    kshiftrd $2, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k0, %ebx
+; AVX512BW-NEXT:    movzbl %bl, %ebx
+; AVX512BW-NEXT:    andl $1, %ebx
+; AVX512BW-NEXT:    leaq (%rbx,%rbx,2), %r12
+; AVX512BW-NEXT:    leaq (%r12,%rbx,4), %r12
+; AVX512BW-NEXT:    leaq (%r12,%rbx,8), %r12
+; AVX512BW-NEXT:    movq %rbx, %r13
+; AVX512BW-NEXT:    shlq $4, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    shlq $5, %rbx
+; AVX512BW-NEXT:    orq %r13, %rbx
+; AVX512BW-NEXT:    movzbl %r15b, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %r15
+; AVX512BW-NEXT:    shlq $6, %r15
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $7, %r13
+; AVX512BW-NEXT:    orq %r15, %r13
+; AVX512BW-NEXT:    movq %r12, %r15
+; AVX512BW-NEXT:    shlq $8, %r15
+; AVX512BW-NEXT:    orq %r13, %r15
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $9, %r13
+; AVX512BW-NEXT:    orq %r15, %r13
+; AVX512BW-NEXT:    movq %r12, %r15
+; AVX512BW-NEXT:    shlq $10, %r15
+; AVX512BW-NEXT:    orq %r13, %r15
+; AVX512BW-NEXT:    kmovd %k1, %r13d
+; AVX512BW-NEXT:    kshiftrd $3, %k0, %k1
+; AVX512BW-NEXT:    shlq $11, %r12
+; AVX512BW-NEXT:    orq %r15, %r12
+; AVX512BW-NEXT:    movzbl %r13b, %r15d
+; AVX512BW-NEXT:    andl $1, %r15d
+; AVX512BW-NEXT:    movq %r15, %r13
+; AVX512BW-NEXT:    shlq $12, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    movq %r15, %r12
+; AVX512BW-NEXT:    shlq $13, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    movq %r15, %r13
+; AVX512BW-NEXT:    shlq $14, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    movq %r15, %r12
+; AVX512BW-NEXT:    shlq $15, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    movq %r15, %r13
+; AVX512BW-NEXT:    shlq $16, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrd $4, %k0, %k1
+; AVX512BW-NEXT:    shlq $17, %r15
+; AVX512BW-NEXT:    orq %r13, %r15
+; AVX512BW-NEXT:    movzbl %r12b, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $18, %r13
+; AVX512BW-NEXT:    orq %r15, %r13
+; AVX512BW-NEXT:    movq %r12, %r15
+; AVX512BW-NEXT:    shlq $19, %r15
+; AVX512BW-NEXT:    orq %r13, %r15
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $20, %r13
+; AVX512BW-NEXT:    orq %r15, %r13
+; AVX512BW-NEXT:    movq %r12, %r15
+; AVX512BW-NEXT:    shlq $21, %r15
+; AVX512BW-NEXT:    orq %r13, %r15
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $22, %r13
+; AVX512BW-NEXT:    orq %r15, %r13
+; AVX512BW-NEXT:    kmovd %k1, %r15d
+; AVX512BW-NEXT:    kshiftrd $10, %k0, %k1
+; AVX512BW-NEXT:    shlq $23, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    movzbl %r15b, %r15d
+; AVX512BW-NEXT:    andl $1, %r15d
+; AVX512BW-NEXT:    movq %r15, %r13
+; AVX512BW-NEXT:    shlq $24, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    movq %r15, %r12
+; AVX512BW-NEXT:    shlq $25, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    movq %r15, %r13
+; AVX512BW-NEXT:    shlq $26, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    movq %r15, %r12
+; AVX512BW-NEXT:    shlq $27, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    movq %r15, %r13
+; AVX512BW-NEXT:    shlq $28, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    shlq $29, %r15
+; AVX512BW-NEXT:    orq %r13, %r15
+; AVX512BW-NEXT:    movzbl %r14b, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $30, %r13
+; AVX512BW-NEXT:    orq %r15, %r13
+; AVX512BW-NEXT:    kmovd %k1, %r15d
+; AVX512BW-NEXT:    kshiftrd $6, %k0, %k1
+; AVX512BW-NEXT:    shlq $31, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    andl $1, %r14d
+; AVX512BW-NEXT:    movq %r14, %r13
+; AVX512BW-NEXT:    shlq $32, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    movq %r14, %r12
+; AVX512BW-NEXT:    shlq $33, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    movq %r14, %r13
+; AVX512BW-NEXT:    shlq $34, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrd $7, %k0, %k1
+; AVX512BW-NEXT:    shlq $35, %r14
+; AVX512BW-NEXT:    orq %r13, %r14
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $36, %r13
+; AVX512BW-NEXT:    orq %r14, %r13
+; AVX512BW-NEXT:    movq %r12, %r14
+; AVX512BW-NEXT:    shlq $37, %r14
+; AVX512BW-NEXT:    orq %r13, %r14
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $38, %r13
+; AVX512BW-NEXT:    orq %r14, %r13
+; AVX512BW-NEXT:    movq %r12, %r14
+; AVX512BW-NEXT:    shlq $39, %r14
+; AVX512BW-NEXT:    orq %r13, %r14
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $40, %r13
+; AVX512BW-NEXT:    orq %r14, %r13
+; AVX512BW-NEXT:    kmovd %k1, %r14d
+; AVX512BW-NEXT:    kshiftrd $8, %k0, %k1
+; AVX512BW-NEXT:    shlq $41, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    andl $1, %r14d
+; AVX512BW-NEXT:    movq %r14, %r13
+; AVX512BW-NEXT:    shlq $42, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    movq %r14, %r12
+; AVX512BW-NEXT:    shlq $43, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    movq %r14, %r13
+; AVX512BW-NEXT:    shlq $44, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    movq %r14, %r12
+; AVX512BW-NEXT:    shlq $45, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    movq %r14, %r13
+; AVX512BW-NEXT:    shlq $46, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrd $9, %k0, %k1
+; AVX512BW-NEXT:    shlq $47, %r14
+; AVX512BW-NEXT:    orq %r13, %r14
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $48, %r13
+; AVX512BW-NEXT:    orq %r14, %r13
+; AVX512BW-NEXT:    movq %r12, %r14
+; AVX512BW-NEXT:    shlq $49, %r14
+; AVX512BW-NEXT:    orq %r13, %r14
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $50, %r13
+; AVX512BW-NEXT:    orq %r14, %r13
+; AVX512BW-NEXT:    movq %r12, %r14
+; AVX512BW-NEXT:    shlq $51, %r14
+; AVX512BW-NEXT:    orq %r13, %r14
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $52, %r13
+; AVX512BW-NEXT:    orq %r14, %r13
+; AVX512BW-NEXT:    kmovd %k1, %r14d
+; AVX512BW-NEXT:    kshiftrd $13, %k0, %k1
+; AVX512BW-NEXT:    shlq $53, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    andl $1, %r14d
+; AVX512BW-NEXT:    movq %r14, %r13
+; AVX512BW-NEXT:    shlq $54, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    movq %r14, %r12
+; AVX512BW-NEXT:    shlq $55, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    movq %r14, %r13
+; AVX512BW-NEXT:    shlq $56, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    movq %r14, %r12
+; AVX512BW-NEXT:    shlq $57, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    movq %r14, %r13
+; AVX512BW-NEXT:    shlq $58, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    shlq $59, %r14
+; AVX512BW-NEXT:    orq %r13, %r14
+; AVX512BW-NEXT:    movl %r15d, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $60, %r13
+; AVX512BW-NEXT:    orq %r14, %r13
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $61, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    kmovd %k1, %r14d
+; AVX512BW-NEXT:    kshiftrd $12, %k0, %k1
+; AVX512BW-NEXT:    movzbl %r15b, %r15d
+; AVX512BW-NEXT:    shlq $62, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $63, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    kmovd %k1, %r13d
+; AVX512BW-NEXT:    kshiftrd $11, %k0, %k1
+; AVX512BW-NEXT:    orq %rbx, %rax
+; AVX512BW-NEXT:    kmovd %k1, %ebx
+; AVX512BW-NEXT:    kshiftrd $16, %k0, %k1
+; AVX512BW-NEXT:    movq %rax, (%rsi)
+; AVX512BW-NEXT:    movzbl %bl, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    andl $1, %r15d
+; AVX512BW-NEXT:    leaq (%r15,%r15,2), %rax
+; AVX512BW-NEXT:    leaq (%rax,%r12,4), %rax
+; AVX512BW-NEXT:    leaq (%rax,%r12,8), %rax
+; AVX512BW-NEXT:    movq %r12, %r15
+; AVX512BW-NEXT:    shlq $4, %r15
+; AVX512BW-NEXT:    orq %rax, %r15
+; AVX512BW-NEXT:    movq %r12, %rbx
+; AVX512BW-NEXT:    shlq $5, %rbx
+; AVX512BW-NEXT:    orq %r15, %rbx
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $6, %rax
+; AVX512BW-NEXT:    shlq $7, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movzbl %r13b, %r15d
+; AVX512BW-NEXT:    andl $1, %r15d
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $8, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r15, %r12
+; AVX512BW-NEXT:    shlq $9, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $10, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r15, %r13
+; AVX512BW-NEXT:    shlq $11, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r15, %r12
+; AVX512BW-NEXT:    shlq $12, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    kmovd %k1, %r13d
+; AVX512BW-NEXT:    kshiftrd $17, %k0, %k1
+; AVX512BW-NEXT:    shlq $13, %r15
+; AVX512BW-NEXT:    orq %r12, %r15
+; AVX512BW-NEXT:    movzbl %r14b, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %r14
+; AVX512BW-NEXT:    shlq $14, %r14
+; AVX512BW-NEXT:    orq %r15, %r14
+; AVX512BW-NEXT:    movq %rax, %r15
+; AVX512BW-NEXT:    shlq $15, %r15
+; AVX512BW-NEXT:    orq %r14, %r15
+; AVX512BW-NEXT:    movq %rax, %r14
+; AVX512BW-NEXT:    shlq $16, %r14
+; AVX512BW-NEXT:    orq %r15, %r14
+; AVX512BW-NEXT:    movq %rax, %r15
+; AVX512BW-NEXT:    shlq $17, %r15
+; AVX512BW-NEXT:    orq %r14, %r15
+; AVX512BW-NEXT:    movq %rax, %r12
+; AVX512BW-NEXT:    shlq $18, %r12
+; AVX512BW-NEXT:    orq %r15, %r12
+; AVX512BW-NEXT:    kmovd %k1, %r14d
+; AVX512BW-NEXT:    kshiftrd $18, %k0, %k1
+; AVX512BW-NEXT:    shlq $19, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movzbl %bpl, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %r15
+; AVX512BW-NEXT:    shlq $20, %r15
+; AVX512BW-NEXT:    orq %rax, %r15
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $21, %rax
+; AVX512BW-NEXT:    orq %r15, %rax
+; AVX512BW-NEXT:    movq %r12, %r15
+; AVX512BW-NEXT:    shlq $22, %r15
+; AVX512BW-NEXT:    orq %rax, %r15
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $23, %rax
+; AVX512BW-NEXT:    orq %r15, %rax
+; AVX512BW-NEXT:    movq %r12, %rbp
+; AVX512BW-NEXT:    shlq $24, %rbp
+; AVX512BW-NEXT:    orq %rax, %rbp
+; AVX512BW-NEXT:    kmovd %k1, %r15d
+; AVX512BW-NEXT:    kshiftrd $19, %k0, %k1
+; AVX512BW-NEXT:    shlq $25, %r12
+; AVX512BW-NEXT:    orq %rbp, %r12
+; AVX512BW-NEXT:    movzbl %r11b, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %r11
+; AVX512BW-NEXT:    shlq $26, %r11
+; AVX512BW-NEXT:    orq %r12, %r11
+; AVX512BW-NEXT:    movq %rax, %r12
+; AVX512BW-NEXT:    shlq $27, %r12
+; AVX512BW-NEXT:    orq %r11, %r12
+; AVX512BW-NEXT:    movq %rax, %r11
+; AVX512BW-NEXT:    shlq $28, %r11
+; AVX512BW-NEXT:    orq %r12, %r11
+; AVX512BW-NEXT:    movq %rax, %r12
+; AVX512BW-NEXT:    shlq $29, %r12
+; AVX512BW-NEXT:    orq %r11, %r12
+; AVX512BW-NEXT:    movq %rax, %rbp
+; AVX512BW-NEXT:    shlq $30, %rbp
+; AVX512BW-NEXT:    orq %r12, %rbp
+; AVX512BW-NEXT:    kmovd %k1, %r11d
+; AVX512BW-NEXT:    kshiftrd $20, %k0, %k1
+; AVX512BW-NEXT:    shlq $31, %rax
+; AVX512BW-NEXT:    orq %rbp, %rax
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $32, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $33, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $34, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $35, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %rbp
+; AVX512BW-NEXT:    shlq $36, %rbp
+; AVX512BW-NEXT:    orq %rax, %rbp
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrd $27, %k0, %k1
+; AVX512BW-NEXT:    shlq $37, %r13
+; AVX512BW-NEXT:    orq %rbp, %r13
+; AVX512BW-NEXT:    andl $1, %r14d
+; AVX512BW-NEXT:    movq %r14, %rax
+; AVX512BW-NEXT:    shlq $38, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r14, %r13
+; AVX512BW-NEXT:    shlq $39, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r14, %rax
+; AVX512BW-NEXT:    shlq $40, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r14, %rbp
+; AVX512BW-NEXT:    shlq $41, %rbp
+; AVX512BW-NEXT:    orq %rax, %rbp
+; AVX512BW-NEXT:    movq %r14, %r13
+; AVX512BW-NEXT:    shlq $42, %r13
+; AVX512BW-NEXT:    orq %rbp, %r13
+; AVX512BW-NEXT:    kmovd %k1, %ebp
+; AVX512BW-NEXT:    kshiftrd $28, %k0, %k1
+; AVX512BW-NEXT:    shlq $43, %r14
+; AVX512BW-NEXT:    orq %r13, %r14
+; AVX512BW-NEXT:    andl $1, %r15d
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $44, %rax
+; AVX512BW-NEXT:    orq %r14, %rax
+; AVX512BW-NEXT:    movq %r15, %r14
+; AVX512BW-NEXT:    shlq $45, %r14
+; AVX512BW-NEXT:    orq %rax, %r14
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $46, %rax
+; AVX512BW-NEXT:    orq %r14, %rax
+; AVX512BW-NEXT:    movq %r15, %r14
+; AVX512BW-NEXT:    shlq $47, %r14
+; AVX512BW-NEXT:    orq %rax, %r14
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $48, %rax
+; AVX512BW-NEXT:    orq %r14, %rax
+; AVX512BW-NEXT:    kmovd %k1, %r14d
+; AVX512BW-NEXT:    kshiftrd $29, %k0, %k1
+; AVX512BW-NEXT:    shlq $49, %r15
+; AVX512BW-NEXT:    orq %rax, %r15
+; AVX512BW-NEXT:    andl $1, %r11d
+; AVX512BW-NEXT:    movq %r11, %rax
+; AVX512BW-NEXT:    shlq $50, %rax
+; AVX512BW-NEXT:    orq %r15, %rax
+; AVX512BW-NEXT:    movq %r11, %r15
+; AVX512BW-NEXT:    shlq $51, %r15
+; AVX512BW-NEXT:    orq %rax, %r15
+; AVX512BW-NEXT:    movq %r11, %rax
+; AVX512BW-NEXT:    shlq $52, %rax
+; AVX512BW-NEXT:    orq %r15, %rax
+; AVX512BW-NEXT:    movq %r11, %r15
+; AVX512BW-NEXT:    shlq $53, %r15
+; AVX512BW-NEXT:    orq %rax, %r15
+; AVX512BW-NEXT:    movq %r11, %rax
+; AVX512BW-NEXT:    shlq $54, %rax
+; AVX512BW-NEXT:    orq %r15, %rax
+; AVX512BW-NEXT:    kmovd %k1, %r15d
+; AVX512BW-NEXT:    kshiftrd $30, %k0, %k0
+; AVX512BW-NEXT:    shlq $55, %r11
+; AVX512BW-NEXT:    orq %rax, %r11
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $56, %rax
+; AVX512BW-NEXT:    orq %r11, %rax
+; AVX512BW-NEXT:    movq %r12, %r11
+; AVX512BW-NEXT:    shlq $57, %r11
+; AVX512BW-NEXT:    orq %rax, %r11
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $58, %rax
+; AVX512BW-NEXT:    orq %r11, %rax
+; AVX512BW-NEXT:    movq %r12, %r11
+; AVX512BW-NEXT:    shlq $59, %r11
+; AVX512BW-NEXT:    orq %rax, %r11
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $60, %rax
+; AVX512BW-NEXT:    orq %r11, %rax
+; AVX512BW-NEXT:    kmovd %k0, %r11d
+; AVX512BW-NEXT:    shlq $61, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movzbl %r10b, %eax
+; AVX512BW-NEXT:    # kill: def $r10d killed $r10d def $r10
+; AVX512BW-NEXT:    andl $1, %r10d
+; AVX512BW-NEXT:    shlq $62, %r10
+; AVX512BW-NEXT:    orq %r12, %r10
+; AVX512BW-NEXT:    movq %rax, %r12
+; AVX512BW-NEXT:    shlq $63, %r12
+; AVX512BW-NEXT:    orq %r10, %r12
+; AVX512BW-NEXT:    orq %rbx, %r12
+; AVX512BW-NEXT:    movq %r12, 8(%rsi)
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    leaq (%rax,%rax,2), %r10
+; AVX512BW-NEXT:    leaq (%r10,%rax,4), %r10
+; AVX512BW-NEXT:    leaq (%r10,%rax,8), %rax
+; AVX512BW-NEXT:    movzbl %r9b, %r10d
+; AVX512BW-NEXT:    andl $1, %r10d
+; AVX512BW-NEXT:    movq %r10, %rbx
+; AVX512BW-NEXT:    shlq $4, %rbx
+; AVX512BW-NEXT:    orq %rax, %rbx
+; AVX512BW-NEXT:    movq %r10, %r9
+; AVX512BW-NEXT:    shlq $5, %r9
+; AVX512BW-NEXT:    orq %rbx, %r9
+; AVX512BW-NEXT:    movq %r10, %rax
+; AVX512BW-NEXT:    shlq $6, %rax
+; AVX512BW-NEXT:    movq %r10, %rbx
+; AVX512BW-NEXT:    shlq $7, %rbx
+; AVX512BW-NEXT:    orq %rax, %rbx
+; AVX512BW-NEXT:    movq %r10, %rax
+; AVX512BW-NEXT:    shlq $8, %rax
+; AVX512BW-NEXT:    orq %rbx, %rax
+; AVX512BW-NEXT:    shlq $9, %r10
+; AVX512BW-NEXT:    orq %rax, %r10
+; AVX512BW-NEXT:    movzbl %r8b, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %r8
+; AVX512BW-NEXT:    shlq $10, %r8
+; AVX512BW-NEXT:    orq %r10, %r8
+; AVX512BW-NEXT:    movq %rax, %r10
+; AVX512BW-NEXT:    shlq $11, %r10
+; AVX512BW-NEXT:    orq %r8, %r10
+; AVX512BW-NEXT:    movq %rax, %r8
+; AVX512BW-NEXT:    shlq $12, %r8
+; AVX512BW-NEXT:    orq %r10, %r8
+; AVX512BW-NEXT:    movq %rax, %r10
+; AVX512BW-NEXT:    shlq $13, %r10
+; AVX512BW-NEXT:    orq %r8, %r10
+; AVX512BW-NEXT:    movq %rax, %r8
+; AVX512BW-NEXT:    shlq $14, %r8
+; AVX512BW-NEXT:    orq %r10, %r8
+; AVX512BW-NEXT:    shlq $15, %rax
+; AVX512BW-NEXT:    orq %r8, %rax
+; AVX512BW-NEXT:    movzbl %dil, %edi
+; AVX512BW-NEXT:    andl $1, %edi
+; AVX512BW-NEXT:    movq %rdi, %r8
+; AVX512BW-NEXT:    shlq $16, %r8
+; AVX512BW-NEXT:    orq %rax, %r8
+; AVX512BW-NEXT:    movq %rdi, %rax
+; AVX512BW-NEXT:    shlq $17, %rax
+; AVX512BW-NEXT:    orq %r8, %rax
+; AVX512BW-NEXT:    movq %rdi, %r8
+; AVX512BW-NEXT:    shlq $18, %r8
+; AVX512BW-NEXT:    orq %rax, %r8
+; AVX512BW-NEXT:    movq %rdi, %rax
+; AVX512BW-NEXT:    shlq $19, %rax
+; AVX512BW-NEXT:    orq %r8, %rax
+; AVX512BW-NEXT:    movq %rdi, %r8
+; AVX512BW-NEXT:    shlq $20, %r8
+; AVX512BW-NEXT:    orq %rax, %r8
+; AVX512BW-NEXT:    shlq $21, %rdi
+; AVX512BW-NEXT:    orq %r8, %rdi
+; AVX512BW-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %r8
+; AVX512BW-NEXT:    shlq $22, %r8
+; AVX512BW-NEXT:    orq %rdi, %r8
+; AVX512BW-NEXT:    movq %rax, %rdi
+; AVX512BW-NEXT:    shlq $23, %rdi
+; AVX512BW-NEXT:    orq %r8, %rdi
+; AVX512BW-NEXT:    movq %rax, %r8
+; AVX512BW-NEXT:    shlq $24, %r8
+; AVX512BW-NEXT:    orq %rdi, %r8
+; AVX512BW-NEXT:    movq %rax, %rdi
+; AVX512BW-NEXT:    shlq $25, %rdi
+; AVX512BW-NEXT:    orq %r8, %rdi
+; AVX512BW-NEXT:    movq %rax, %r8
+; AVX512BW-NEXT:    shlq $26, %r8
+; AVX512BW-NEXT:    orq %rdi, %r8
+; AVX512BW-NEXT:    shlq $27, %rax
+; AVX512BW-NEXT:    orq %r8, %rax
+; AVX512BW-NEXT:    movzbl %cl, %edi
+; AVX512BW-NEXT:    andl $1, %edi
+; AVX512BW-NEXT:    movq %rdi, %r8
+; AVX512BW-NEXT:    shlq $28, %r8
+; AVX512BW-NEXT:    orq %rax, %r8
+; AVX512BW-NEXT:    movq %rdi, %rax
+; AVX512BW-NEXT:    shlq $29, %rax
+; AVX512BW-NEXT:    orq %r8, %rax
+; AVX512BW-NEXT:    movq %rdi, %r8
+; AVX512BW-NEXT:    shlq $30, %r8
+; AVX512BW-NEXT:    orq %rax, %r8
+; AVX512BW-NEXT:    shlq $31, %rdi
+; AVX512BW-NEXT:    orq %r8, %rdi
+; AVX512BW-NEXT:    andl $1, %ecx
+; AVX512BW-NEXT:    movq %rcx, %rax
+; AVX512BW-NEXT:    shlq $32, %rax
+; AVX512BW-NEXT:    orq %rdi, %rax
+; AVX512BW-NEXT:    shlq $33, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    andl $1, %ebp
+; AVX512BW-NEXT:    movq %rbp, %rax
+; AVX512BW-NEXT:    shlq $34, %rax
+; AVX512BW-NEXT:    orq %rcx, %rax
+; AVX512BW-NEXT:    movq %rbp, %rcx
+; AVX512BW-NEXT:    shlq $35, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    movq %rbp, %rax
+; AVX512BW-NEXT:    shlq $36, %rax
+; AVX512BW-NEXT:    orq %rcx, %rax
+; AVX512BW-NEXT:    movq %rbp, %rcx
+; AVX512BW-NEXT:    shlq $37, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    movq %rbp, %rax
+; AVX512BW-NEXT:    shlq $38, %rax
+; AVX512BW-NEXT:    orq %rcx, %rax
+; AVX512BW-NEXT:    shlq $39, %rbp
+; AVX512BW-NEXT:    orq %rax, %rbp
+; AVX512BW-NEXT:    andl $1, %r14d
+; AVX512BW-NEXT:    movq %r14, %rax
+; AVX512BW-NEXT:    shlq $40, %rax
+; AVX512BW-NEXT:    orq %rbp, %rax
+; AVX512BW-NEXT:    movq %r14, %rcx
+; AVX512BW-NEXT:    shlq $41, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    movq %r14, %rax
+; AVX512BW-NEXT:    shlq $42, %rax
+; AVX512BW-NEXT:    orq %rcx, %rax
+; AVX512BW-NEXT:    movq %r14, %rcx
+; AVX512BW-NEXT:    shlq $43, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    movq %r14, %rax
+; AVX512BW-NEXT:    shlq $44, %rax
+; AVX512BW-NEXT:    orq %rcx, %rax
+; AVX512BW-NEXT:    shlq $45, %r14
+; AVX512BW-NEXT:    orq %rax, %r14
+; AVX512BW-NEXT:    andl $1, %r15d
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $46, %rax
+; AVX512BW-NEXT:    orq %r14, %rax
+; AVX512BW-NEXT:    movq %r15, %rcx
+; AVX512BW-NEXT:    shlq $47, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $48, %rax
+; AVX512BW-NEXT:    orq %rcx, %rax
+; AVX512BW-NEXT:    movq %r15, %rcx
+; AVX512BW-NEXT:    shlq $49, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $50, %rax
+; AVX512BW-NEXT:    orq %rcx, %rax
+; AVX512BW-NEXT:    shlq $51, %r15
+; AVX512BW-NEXT:    orq %rax, %r15
+; AVX512BW-NEXT:    andl $1, %r11d
+; AVX512BW-NEXT:    movq %r11, %rax
+; AVX512BW-NEXT:    shlq $52, %rax
+; AVX512BW-NEXT:    orq %r15, %rax
+; AVX512BW-NEXT:    movq %r11, %rcx
+; AVX512BW-NEXT:    shlq $53, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    movq %r11, %rax
+; AVX512BW-NEXT:    shlq $54, %rax
+; AVX512BW-NEXT:    orq %rcx, %rax
+; AVX512BW-NEXT:    movq %r11, %rcx
+; AVX512BW-NEXT:    shlq $55, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    movq %r11, %rax
+; AVX512BW-NEXT:    shlq $56, %rax
+; AVX512BW-NEXT:    orq %rcx, %rax
+; AVX512BW-NEXT:    shlq $57, %r11
+; AVX512BW-NEXT:    orq %rax, %r11
+; AVX512BW-NEXT:    movl %edx, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %rcx
+; AVX512BW-NEXT:    shlq $58, %rcx
+; AVX512BW-NEXT:    orq %r11, %rcx
+; AVX512BW-NEXT:    movq %rax, %rdi
+; AVX512BW-NEXT:    shlq $59, %rdi
+; AVX512BW-NEXT:    orq %rcx, %rdi
+; AVX512BW-NEXT:    movq %rax, %rcx
+; AVX512BW-NEXT:    shlq $60, %rcx
+; AVX512BW-NEXT:    orq %rdi, %rcx
+; AVX512BW-NEXT:    movq %rax, %rdi
+; AVX512BW-NEXT:    shlq $61, %rdi
+; AVX512BW-NEXT:    orq %rcx, %rdi
+; AVX512BW-NEXT:    shlq $62, %rax
+; AVX512BW-NEXT:    orq %rdi, %rax
+; AVX512BW-NEXT:    movzbl %dl, %ecx
+; AVX512BW-NEXT:    shlq $63, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    orq %r9, %rcx
+; AVX512BW-NEXT:    movq %rcx, 16(%rsi)
+; AVX512BW-NEXT:    popq %rbx
+; AVX512BW-NEXT:    popq %r12
+; AVX512BW-NEXT:    popq %r13
+; AVX512BW-NEXT:    popq %r14
+; AVX512BW-NEXT:    popq %r15
+; AVX512BW-NEXT:    popq %rbp
+; AVX512BW-NEXT:    retq
+  %src.vec = load <32 x i1>, ptr %in.vec, align 64
+  %tgt.mask = shufflevector <32 x i1> %src.vec, <32 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+  store <192 x i1> %tgt.mask, ptr %out.vec, align 64
+  ret void
+}
+
+define void @mask_replication_factor6_vf64(ptr %in.vec, ptr %out.vec) nounwind {
+; AVX512F-ONLY-LABEL: mask_replication_factor6_vf64:
+; AVX512F-ONLY:       # %bb.0:
+; AVX512F-ONLY-NEXT:    pushq %rbp
+; AVX512F-ONLY-NEXT:    pushq %r15
+; AVX512F-ONLY-NEXT:    pushq %r14
+; AVX512F-ONLY-NEXT:    pushq %r13
+; AVX512F-ONLY-NEXT:    pushq %r12
+; AVX512F-ONLY-NEXT:    pushq %rbx
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k3
+; AVX512F-ONLY-NEXT:    kmovw 2(%rdi), %k0
+; AVX512F-ONLY-NEXT:    kmovw 4(%rdi), %k2
+; AVX512F-ONLY-NEXT:    kmovw 6(%rdi), %k1
+; AVX512F-ONLY-NEXT:    kshiftrw $3, %k0, %k4
+; AVX512F-ONLY-NEXT:    kmovw %k4, %eax
+; AVX512F-ONLY-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512F-ONLY-NEXT:    kshiftrw $15, %k3, %k4
+; AVX512F-ONLY-NEXT:    kmovw %k4, %ecx
+; AVX512F-ONLY-NEXT:    kshiftrw $14, %k3, %k4
+; AVX512F-ONLY-NEXT:    kmovw %k4, %edx
+; AVX512F-ONLY-NEXT:    kshiftrw $13, %k3, %k4
+; AVX512F-ONLY-NEXT:    kmovw %k4, %edi
+; AVX512F-ONLY-NEXT:    kshiftrw $12, %k3, %k4
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r8d
+; AVX512F-ONLY-NEXT:    kshiftrw $11, %k3, %k4
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r9d
+; AVX512F-ONLY-NEXT:    kshiftrw $10, %k0, %k4
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r10d
+; AVX512F-ONLY-NEXT:    kshiftrw $6, %k0, %k4
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r11d
+; AVX512F-ONLY-NEXT:    kshiftrw $5, %k3, %k4
+; AVX512F-ONLY-NEXT:    kmovw %k4, %ebx
+; AVX512F-ONLY-NEXT:    kshiftrw $10, %k2, %k4
+; AVX512F-ONLY-NEXT:    kmovw %k4, %ebp
+; AVX512F-ONLY-NEXT:    kshiftrw $1, %k2, %k4
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r15d
+; AVX512F-ONLY-NEXT:    kshiftrw $2, %k2, %k4
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r14d
+; AVX512F-ONLY-NEXT:    movzbl %r14b, %r14d
+; AVX512F-ONLY-NEXT:    andl $1, %r14d
+; AVX512F-ONLY-NEXT:    leaq (%r14,%r14,2), %r12
+; AVX512F-ONLY-NEXT:    leaq (%r12,%r14,4), %r12
+; AVX512F-ONLY-NEXT:    leaq (%r12,%r14,8), %r12
+; AVX512F-ONLY-NEXT:    movq %r14, %r13
+; AVX512F-ONLY-NEXT:    shlq $4, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $5, %r14
+; AVX512F-ONLY-NEXT:    orq %r13, %r14
+; AVX512F-ONLY-NEXT:    movzbl %r15b, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %r15
+; AVX512F-ONLY-NEXT:    shlq $6, %r15
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $7, %r13
+; AVX512F-ONLY-NEXT:    orq %r15, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %r15
+; AVX512F-ONLY-NEXT:    shlq $8, %r15
+; AVX512F-ONLY-NEXT:    orq %r13, %r15
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $9, %r13
+; AVX512F-ONLY-NEXT:    orq %r15, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %r15
+; AVX512F-ONLY-NEXT:    shlq $10, %r15
+; AVX512F-ONLY-NEXT:    orq %r13, %r15
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r13d
+; AVX512F-ONLY-NEXT:    kshiftrw $3, %k2, %k4
+; AVX512F-ONLY-NEXT:    shlq $11, %r12
+; AVX512F-ONLY-NEXT:    orq %r15, %r12
+; AVX512F-ONLY-NEXT:    movzbl %r13b, %r15d
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    movq %r15, %r13
+; AVX512F-ONLY-NEXT:    shlq $12, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $13, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    movq %r15, %r13
+; AVX512F-ONLY-NEXT:    shlq $14, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $15, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    movq %r15, %r13
+; AVX512F-ONLY-NEXT:    shlq $16, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $4, %k2, %k4
+; AVX512F-ONLY-NEXT:    shlq $17, %r15
+; AVX512F-ONLY-NEXT:    orq %r13, %r15
+; AVX512F-ONLY-NEXT:    movzbl %r12b, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $18, %r13
+; AVX512F-ONLY-NEXT:    orq %r15, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %r15
+; AVX512F-ONLY-NEXT:    shlq $19, %r15
+; AVX512F-ONLY-NEXT:    orq %r13, %r15
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $20, %r13
+; AVX512F-ONLY-NEXT:    orq %r15, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %r15
+; AVX512F-ONLY-NEXT:    shlq $21, %r15
+; AVX512F-ONLY-NEXT:    orq %r13, %r15
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $22, %r13
+; AVX512F-ONLY-NEXT:    orq %r15, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r15d
+; AVX512F-ONLY-NEXT:    kshiftrw $5, %k2, %k4
+; AVX512F-ONLY-NEXT:    shlq $23, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    movzbl %r15b, %r15d
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    movq %r15, %r13
+; AVX512F-ONLY-NEXT:    shlq $24, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $25, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    movq %r15, %r13
+; AVX512F-ONLY-NEXT:    shlq $26, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $27, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    movq %r15, %r13
+; AVX512F-ONLY-NEXT:    shlq $28, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $6, %k2, %k4
+; AVX512F-ONLY-NEXT:    shlq $29, %r15
+; AVX512F-ONLY-NEXT:    orq %r13, %r15
+; AVX512F-ONLY-NEXT:    movzbl %r12b, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $30, %r13
+; AVX512F-ONLY-NEXT:    orq %r15, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %r15
+; AVX512F-ONLY-NEXT:    shlq $31, %r15
+; AVX512F-ONLY-NEXT:    orq %r13, %r15
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $32, %r13
+; AVX512F-ONLY-NEXT:    orq %r15, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %r15
+; AVX512F-ONLY-NEXT:    shlq $33, %r15
+; AVX512F-ONLY-NEXT:    orq %r13, %r15
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $34, %r13
+; AVX512F-ONLY-NEXT:    orq %r15, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r15d
+; AVX512F-ONLY-NEXT:    kshiftrw $7, %k2, %k4
+; AVX512F-ONLY-NEXT:    shlq $35, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    movzbl %r15b, %r15d
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    movq %r15, %r13
+; AVX512F-ONLY-NEXT:    shlq $36, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $37, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    movq %r15, %r13
+; AVX512F-ONLY-NEXT:    shlq $38, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $39, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    movq %r15, %r13
+; AVX512F-ONLY-NEXT:    shlq $40, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $8, %k2, %k4
+; AVX512F-ONLY-NEXT:    shlq $41, %r15
+; AVX512F-ONLY-NEXT:    orq %r13, %r15
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $42, %r13
+; AVX512F-ONLY-NEXT:    orq %r15, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %r15
+; AVX512F-ONLY-NEXT:    shlq $43, %r15
+; AVX512F-ONLY-NEXT:    orq %r13, %r15
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $44, %r13
+; AVX512F-ONLY-NEXT:    orq %r15, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %r15
+; AVX512F-ONLY-NEXT:    shlq $45, %r15
+; AVX512F-ONLY-NEXT:    orq %r13, %r15
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $46, %r13
+; AVX512F-ONLY-NEXT:    orq %r15, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r15d
+; AVX512F-ONLY-NEXT:    kshiftrw $9, %k2, %k4
+; AVX512F-ONLY-NEXT:    shlq $47, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    movq %r15, %r13
+; AVX512F-ONLY-NEXT:    shlq $48, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $49, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    movq %r15, %r13
+; AVX512F-ONLY-NEXT:    shlq $50, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $51, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    movq %r15, %r13
+; AVX512F-ONLY-NEXT:    shlq $52, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $1, %k3, %k4
+; AVX512F-ONLY-NEXT:    shlq $53, %r15
+; AVX512F-ONLY-NEXT:    orq %r13, %r15
+; AVX512F-ONLY-NEXT:    movzbl %r12b, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $54, %r13
+; AVX512F-ONLY-NEXT:    orq %r15, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %r15
+; AVX512F-ONLY-NEXT:    shlq $55, %r15
+; AVX512F-ONLY-NEXT:    orq %r13, %r15
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $56, %r13
+; AVX512F-ONLY-NEXT:    orq %r15, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %r15
+; AVX512F-ONLY-NEXT:    shlq $57, %r15
+; AVX512F-ONLY-NEXT:    orq %r13, %r15
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $58, %r13
+; AVX512F-ONLY-NEXT:    orq %r15, %r13
+; AVX512F-ONLY-NEXT:    shlq $59, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    movzbl %bpl, %r15d
+; AVX512F-ONLY-NEXT:    movl %ebp, %r13d
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %rbp
+; AVX512F-ONLY-NEXT:    shlq $60, %rbp
+; AVX512F-ONLY-NEXT:    orq %r12, %rbp
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $61, %r12
+; AVX512F-ONLY-NEXT:    orq %rbp, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k4, %ebp
+; AVX512F-ONLY-NEXT:    kshiftrw $2, %k3, %k4
+; AVX512F-ONLY-NEXT:    shlq $62, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $63, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    orq %r14, %r12
+; AVX512F-ONLY-NEXT:    movq %r12, 24(%rsi)
+; AVX512F-ONLY-NEXT:    kmovw %k3, %r14d
+; AVX512F-ONLY-NEXT:    movzbl %r14b, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    leaq (%r12,%r12,2), %r14
+; AVX512F-ONLY-NEXT:    leaq (%r14,%r12,4), %r14
+; AVX512F-ONLY-NEXT:    leaq (%r14,%r12,8), %r14
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $4, %r13
+; AVX512F-ONLY-NEXT:    orq %r14, %r13
+; AVX512F-ONLY-NEXT:    shlq $5, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    movzbl %bpl, %r13d
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %r14
+; AVX512F-ONLY-NEXT:    shlq $6, %r14
+; AVX512F-ONLY-NEXT:    movq %r13, %rbp
+; AVX512F-ONLY-NEXT:    shlq $7, %rbp
+; AVX512F-ONLY-NEXT:    orq %r14, %rbp
+; AVX512F-ONLY-NEXT:    movq %r13, %r14
+; AVX512F-ONLY-NEXT:    shlq $8, %r14
+; AVX512F-ONLY-NEXT:    orq %rbp, %r14
+; AVX512F-ONLY-NEXT:    movq %r13, %rbp
+; AVX512F-ONLY-NEXT:    shlq $9, %rbp
+; AVX512F-ONLY-NEXT:    orq %r14, %rbp
+; AVX512F-ONLY-NEXT:    movq %r13, %r14
+; AVX512F-ONLY-NEXT:    shlq $10, %r14
+; AVX512F-ONLY-NEXT:    orq %rbp, %r14
+; AVX512F-ONLY-NEXT:    kmovw %k4, %ebp
+; AVX512F-ONLY-NEXT:    kshiftrw $3, %k3, %k4
+; AVX512F-ONLY-NEXT:    shlq $11, %r13
+; AVX512F-ONLY-NEXT:    orq %r14, %r13
+; AVX512F-ONLY-NEXT:    movzbl %bpl, %r14d
+; AVX512F-ONLY-NEXT:    andl $1, %r14d
+; AVX512F-ONLY-NEXT:    movq %r14, %rbp
+; AVX512F-ONLY-NEXT:    shlq $12, %rbp
+; AVX512F-ONLY-NEXT:    orq %r13, %rbp
+; AVX512F-ONLY-NEXT:    movq %r14, %r13
+; AVX512F-ONLY-NEXT:    shlq $13, %r13
+; AVX512F-ONLY-NEXT:    orq %rbp, %r13
+; AVX512F-ONLY-NEXT:    movq %r14, %rbp
+; AVX512F-ONLY-NEXT:    shlq $14, %rbp
+; AVX512F-ONLY-NEXT:    orq %r13, %rbp
+; AVX512F-ONLY-NEXT:    movq %r14, %r13
+; AVX512F-ONLY-NEXT:    shlq $15, %r13
+; AVX512F-ONLY-NEXT:    orq %rbp, %r13
+; AVX512F-ONLY-NEXT:    movq %r14, %rbp
+; AVX512F-ONLY-NEXT:    shlq $16, %rbp
+; AVX512F-ONLY-NEXT:    orq %r13, %rbp
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r13d
+; AVX512F-ONLY-NEXT:    kshiftrw $4, %k3, %k4
+; AVX512F-ONLY-NEXT:    shlq $17, %r14
+; AVX512F-ONLY-NEXT:    orq %rbp, %r14
+; AVX512F-ONLY-NEXT:    movzbl %r13b, %r13d
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %rbp
+; AVX512F-ONLY-NEXT:    shlq $18, %rbp
+; AVX512F-ONLY-NEXT:    orq %r14, %rbp
+; AVX512F-ONLY-NEXT:    movq %r13, %r14
+; AVX512F-ONLY-NEXT:    shlq $19, %r14
+; AVX512F-ONLY-NEXT:    orq %rbp, %r14
+; AVX512F-ONLY-NEXT:    movq %r13, %rbp
+; AVX512F-ONLY-NEXT:    shlq $20, %rbp
+; AVX512F-ONLY-NEXT:    orq %r14, %rbp
+; AVX512F-ONLY-NEXT:    movq %r13, %r14
+; AVX512F-ONLY-NEXT:    shlq $21, %r14
+; AVX512F-ONLY-NEXT:    orq %rbp, %r14
+; AVX512F-ONLY-NEXT:    movq %r13, %rbp
+; AVX512F-ONLY-NEXT:    shlq $22, %rbp
+; AVX512F-ONLY-NEXT:    orq %r14, %rbp
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r14d
+; AVX512F-ONLY-NEXT:    kshiftrw $10, %k3, %k4
+; AVX512F-ONLY-NEXT:    shlq $23, %r13
+; AVX512F-ONLY-NEXT:    orq %rbp, %r13
+; AVX512F-ONLY-NEXT:    movzbl %r14b, %r14d
+; AVX512F-ONLY-NEXT:    andl $1, %r14d
+; AVX512F-ONLY-NEXT:    movq %r14, %rbp
+; AVX512F-ONLY-NEXT:    shlq $24, %rbp
+; AVX512F-ONLY-NEXT:    orq %r13, %rbp
+; AVX512F-ONLY-NEXT:    movq %r14, %r13
+; AVX512F-ONLY-NEXT:    shlq $25, %r13
+; AVX512F-ONLY-NEXT:    orq %rbp, %r13
+; AVX512F-ONLY-NEXT:    movq %r14, %rbp
+; AVX512F-ONLY-NEXT:    shlq $26, %rbp
+; AVX512F-ONLY-NEXT:    orq %r13, %rbp
+; AVX512F-ONLY-NEXT:    movq %r14, %r13
+; AVX512F-ONLY-NEXT:    shlq $27, %r13
+; AVX512F-ONLY-NEXT:    orq %rbp, %r13
+; AVX512F-ONLY-NEXT:    movq %r14, %rbp
+; AVX512F-ONLY-NEXT:    shlq $28, %rbp
+; AVX512F-ONLY-NEXT:    orq %r13, %rbp
+; AVX512F-ONLY-NEXT:    shlq $29, %r14
+; AVX512F-ONLY-NEXT:    orq %rbp, %r14
+; AVX512F-ONLY-NEXT:    movzbl %bl, %r13d
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $30, %rax
+; AVX512F-ONLY-NEXT:    orq %r14, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k4, %ebp
+; AVX512F-ONLY-NEXT:    kshiftrw $6, %k3, %k4
+; AVX512F-ONLY-NEXT:    shlq $31, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    andl $1, %ebx
+; AVX512F-ONLY-NEXT:    movq %rbx, %rax
+; AVX512F-ONLY-NEXT:    shlq $32, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %rbx, %r14
+; AVX512F-ONLY-NEXT:    shlq $33, %r14
+; AVX512F-ONLY-NEXT:    orq %rax, %r14
+; AVX512F-ONLY-NEXT:    movq %rbx, %rax
+; AVX512F-ONLY-NEXT:    shlq $34, %rax
+; AVX512F-ONLY-NEXT:    orq %r14, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r14d
+; AVX512F-ONLY-NEXT:    kshiftrw $7, %k3, %k4
+; AVX512F-ONLY-NEXT:    shlq $35, %rbx
+; AVX512F-ONLY-NEXT:    orq %rax, %rbx
+; AVX512F-ONLY-NEXT:    andl $1, %r14d
+; AVX512F-ONLY-NEXT:    movq %r14, %rax
+; AVX512F-ONLY-NEXT:    shlq $36, %rax
+; AVX512F-ONLY-NEXT:    orq %rbx, %rax
+; AVX512F-ONLY-NEXT:    movq %r14, %rbx
+; AVX512F-ONLY-NEXT:    shlq $37, %rbx
+; AVX512F-ONLY-NEXT:    orq %rax, %rbx
+; AVX512F-ONLY-NEXT:    movq %r14, %rax
+; AVX512F-ONLY-NEXT:    shlq $38, %rax
+; AVX512F-ONLY-NEXT:    orq %rbx, %rax
+; AVX512F-ONLY-NEXT:    movq %r14, %rbx
+; AVX512F-ONLY-NEXT:    shlq $39, %rbx
+; AVX512F-ONLY-NEXT:    orq %rax, %rbx
+; AVX512F-ONLY-NEXT:    movq %r14, %rax
+; AVX512F-ONLY-NEXT:    shlq $40, %rax
+; AVX512F-ONLY-NEXT:    orq %rbx, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k4, %ebx
+; AVX512F-ONLY-NEXT:    kshiftrw $8, %k3, %k4
+; AVX512F-ONLY-NEXT:    shlq $41, %r14
+; AVX512F-ONLY-NEXT:    orq %rax, %r14
+; AVX512F-ONLY-NEXT:    andl $1, %ebx
+; AVX512F-ONLY-NEXT:    movq %rbx, %rax
+; AVX512F-ONLY-NEXT:    shlq $42, %rax
+; AVX512F-ONLY-NEXT:    orq %r14, %rax
+; AVX512F-ONLY-NEXT:    movq %rbx, %r14
+; AVX512F-ONLY-NEXT:    shlq $43, %r14
+; AVX512F-ONLY-NEXT:    orq %rax, %r14
+; AVX512F-ONLY-NEXT:    movq %rbx, %rax
+; AVX512F-ONLY-NEXT:    shlq $44, %rax
+; AVX512F-ONLY-NEXT:    orq %r14, %rax
+; AVX512F-ONLY-NEXT:    movq %rbx, %r14
+; AVX512F-ONLY-NEXT:    shlq $45, %r14
+; AVX512F-ONLY-NEXT:    orq %rax, %r14
+; AVX512F-ONLY-NEXT:    movq %rbx, %rax
+; AVX512F-ONLY-NEXT:    shlq $46, %rax
+; AVX512F-ONLY-NEXT:    orq %r14, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r14d
+; AVX512F-ONLY-NEXT:    kshiftrw $9, %k3, %k3
+; AVX512F-ONLY-NEXT:    shlq $47, %rbx
+; AVX512F-ONLY-NEXT:    orq %rax, %rbx
+; AVX512F-ONLY-NEXT:    andl $1, %r14d
+; AVX512F-ONLY-NEXT:    movq %r14, %rax
+; AVX512F-ONLY-NEXT:    shlq $48, %rax
+; AVX512F-ONLY-NEXT:    orq %rbx, %rax
+; AVX512F-ONLY-NEXT:    movq %r14, %rbx
+; AVX512F-ONLY-NEXT:    shlq $49, %rbx
+; AVX512F-ONLY-NEXT:    orq %rax, %rbx
+; AVX512F-ONLY-NEXT:    movq %r14, %rax
+; AVX512F-ONLY-NEXT:    shlq $50, %rax
+; AVX512F-ONLY-NEXT:    orq %rbx, %rax
+; AVX512F-ONLY-NEXT:    movq %r14, %rbx
+; AVX512F-ONLY-NEXT:    shlq $51, %rbx
+; AVX512F-ONLY-NEXT:    orq %rax, %rbx
+; AVX512F-ONLY-NEXT:    movq %r14, %rax
+; AVX512F-ONLY-NEXT:    shlq $52, %rax
+; AVX512F-ONLY-NEXT:    orq %rbx, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k3, %ebx
+; AVX512F-ONLY-NEXT:    kshiftrw $10, %k1, %k3
+; AVX512F-ONLY-NEXT:    shlq $53, %r14
+; AVX512F-ONLY-NEXT:    orq %rax, %r14
+; AVX512F-ONLY-NEXT:    andl $1, %ebx
+; AVX512F-ONLY-NEXT:    movq %rbx, %rax
+; AVX512F-ONLY-NEXT:    shlq $54, %rax
+; AVX512F-ONLY-NEXT:    orq %r14, %rax
+; AVX512F-ONLY-NEXT:    movq %rbx, %r14
+; AVX512F-ONLY-NEXT:    shlq $55, %r14
+; AVX512F-ONLY-NEXT:    orq %rax, %r14
+; AVX512F-ONLY-NEXT:    movq %rbx, %rax
+; AVX512F-ONLY-NEXT:    shlq $56, %rax
+; AVX512F-ONLY-NEXT:    orq %r14, %rax
+; AVX512F-ONLY-NEXT:    movq %rbx, %r14
+; AVX512F-ONLY-NEXT:    shlq $57, %r14
+; AVX512F-ONLY-NEXT:    orq %rax, %r14
+; AVX512F-ONLY-NEXT:    movq %rbx, %rax
+; AVX512F-ONLY-NEXT:    shlq $58, %rax
+; AVX512F-ONLY-NEXT:    orq %r14, %rax
+; AVX512F-ONLY-NEXT:    shlq $59, %rbx
+; AVX512F-ONLY-NEXT:    orq %rax, %rbx
+; AVX512F-ONLY-NEXT:    movl %ebp, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %r14
+; AVX512F-ONLY-NEXT:    shlq $60, %r14
+; AVX512F-ONLY-NEXT:    orq %rbx, %r14
+; AVX512F-ONLY-NEXT:    movq %rax, %r13
+; AVX512F-ONLY-NEXT:    shlq $61, %r13
+; AVX512F-ONLY-NEXT:    orq %r14, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k3, %r14d
+; AVX512F-ONLY-NEXT:    kshiftrw $12, %k2, %k3
+; AVX512F-ONLY-NEXT:    movzbl %bpl, %ebx
+; AVX512F-ONLY-NEXT:    shlq $62, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %rbx, %r13
+; AVX512F-ONLY-NEXT:    shlq $63, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k3, %eax
+; AVX512F-ONLY-NEXT:    kshiftrw $11, %k2, %k3
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k3, %ebp
+; AVX512F-ONLY-NEXT:    kshiftrw $13, %k2, %k3
+; AVX512F-ONLY-NEXT:    movq %r13, (%rsi)
+; AVX512F-ONLY-NEXT:    movzbl %bpl, %r13d
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    leaq (%r15,%r15,2), %r15
+; AVX512F-ONLY-NEXT:    leaq (%r15,%r13,4), %r15
+; AVX512F-ONLY-NEXT:    leaq (%r15,%r13,8), %r15
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $4, %r12
+; AVX512F-ONLY-NEXT:    orq %r15, %r12
+; AVX512F-ONLY-NEXT:    movq %r13, %r15
+; AVX512F-ONLY-NEXT:    shlq $5, %r15
+; AVX512F-ONLY-NEXT:    orq %r12, %r15
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $6, %r12
+; AVX512F-ONLY-NEXT:    shlq $7, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    movzbl %al, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $8, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $9, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $10, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $11, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $12, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k3, %ebp
+; AVX512F-ONLY-NEXT:    kshiftrw $14, %k2, %k3
+; AVX512F-ONLY-NEXT:    shlq $13, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movzbl %bpl, %r13d
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $14, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $15, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $16, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $17, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $18, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k3, %ebp
+; AVX512F-ONLY-NEXT:    kshiftrw $15, %k2, %k2
+; AVX512F-ONLY-NEXT:    shlq $19, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movzbl %bpl, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $20, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $21, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $22, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $23, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $24, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k2, %ebp
+; AVX512F-ONLY-NEXT:    kshiftrw $1, %k1, %k2
+; AVX512F-ONLY-NEXT:    shlq $25, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movzbl %bpl, %r13d
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $26, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $27, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $28, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $29, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $30, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $31, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k1, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $32, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $33, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $34, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $35, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $36, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r13d
+; AVX512F-ONLY-NEXT:    kshiftrw $2, %k1, %k2
+; AVX512F-ONLY-NEXT:    shlq $37, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $38, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $39, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $40, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $41, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $42, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $3, %k1, %k2
+; AVX512F-ONLY-NEXT:    shlq $43, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $44, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $45, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $46, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $47, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $48, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r13d
+; AVX512F-ONLY-NEXT:    kshiftrw $4, %k1, %k2
+; AVX512F-ONLY-NEXT:    shlq $49, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $50, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $51, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $52, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $53, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $54, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $5, %k1, %k2
+; AVX512F-ONLY-NEXT:    shlq $55, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $56, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $57, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $58, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $59, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $60, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k2, %ebp
+; AVX512F-ONLY-NEXT:    kshiftrw $6, %k1, %k2
+; AVX512F-ONLY-NEXT:    shlq $61, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movzbl %bpl, %eax
+; AVX512F-ONLY-NEXT:    movl %ebp, %r13d
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    shlq $62, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    movq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $63, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k2, %ebp
+; AVX512F-ONLY-NEXT:    kshiftrw $7, %k1, %k2
+; AVX512F-ONLY-NEXT:    orq %r15, %r12
+; AVX512F-ONLY-NEXT:    movq %r12, 32(%rsi)
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    leaq (%rax,%rax,2), %r15
+; AVX512F-ONLY-NEXT:    leaq (%r15,%rax,4), %r15
+; AVX512F-ONLY-NEXT:    leaq (%r15,%rax,8), %rax
+; AVX512F-ONLY-NEXT:    movzbl %bpl, %r13d
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $4, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r13, %r15
+; AVX512F-ONLY-NEXT:    shlq $5, %r15
+; AVX512F-ONLY-NEXT:    orq %r12, %r15
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $6, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $7, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $8, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k2, %ebp
+; AVX512F-ONLY-NEXT:    kshiftrw $8, %k1, %k2
+; AVX512F-ONLY-NEXT:    shlq $9, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movzbl %bpl, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $10, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $11, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $12, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $13, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $14, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k2, %ebp
+; AVX512F-ONLY-NEXT:    kshiftrw $9, %k1, %k2
+; AVX512F-ONLY-NEXT:    shlq $15, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movzbl %bpl, %r13d
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $16, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $17, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $18, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $19, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $20, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k2, %ebp
+; AVX512F-ONLY-NEXT:    kshiftrw $15, %k1, %k2
+; AVX512F-ONLY-NEXT:    shlq $21, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movzbl %bpl, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $22, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $23, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $24, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $25, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $26, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $27, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movzbl %r14b, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %r13
+; AVX512F-ONLY-NEXT:    shlq $28, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    movq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $29, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    movq %rax, %r13
+; AVX512F-ONLY-NEXT:    shlq $30, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k2, %ebp
+; AVX512F-ONLY-NEXT:    kshiftrw $11, %k1, %k2
+; AVX512F-ONLY-NEXT:    shlq $31, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    andl $1, %r14d
+; AVX512F-ONLY-NEXT:    movq %r14, %r13
+; AVX512F-ONLY-NEXT:    shlq $32, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $12, %k1, %k2
+; AVX512F-ONLY-NEXT:    shlq $33, %r14
+; AVX512F-ONLY-NEXT:    orq %r13, %r14
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $34, %rax
+; AVX512F-ONLY-NEXT:    orq %r14, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r14
+; AVX512F-ONLY-NEXT:    shlq $35, %r14
+; AVX512F-ONLY-NEXT:    orq %rax, %r14
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $36, %rax
+; AVX512F-ONLY-NEXT:    orq %r14, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r14
+; AVX512F-ONLY-NEXT:    shlq $37, %r14
+; AVX512F-ONLY-NEXT:    orq %rax, %r14
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $38, %rax
+; AVX512F-ONLY-NEXT:    orq %r14, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r14d
+; AVX512F-ONLY-NEXT:    kshiftrw $13, %k1, %k2
+; AVX512F-ONLY-NEXT:    shlq $39, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    andl $1, %r14d
+; AVX512F-ONLY-NEXT:    movq %r14, %rax
+; AVX512F-ONLY-NEXT:    shlq $40, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r14, %r12
+; AVX512F-ONLY-NEXT:    shlq $41, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r14, %rax
+; AVX512F-ONLY-NEXT:    shlq $42, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r14, %r12
+; AVX512F-ONLY-NEXT:    shlq $43, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r14, %rax
+; AVX512F-ONLY-NEXT:    shlq $44, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $14, %k1, %k1
+; AVX512F-ONLY-NEXT:    shlq $45, %r14
+; AVX512F-ONLY-NEXT:    orq %rax, %r14
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $46, %rax
+; AVX512F-ONLY-NEXT:    orq %r14, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r14
+; AVX512F-ONLY-NEXT:    shlq $47, %r14
+; AVX512F-ONLY-NEXT:    orq %rax, %r14
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $48, %rax
+; AVX512F-ONLY-NEXT:    orq %r14, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r14
+; AVX512F-ONLY-NEXT:    shlq $49, %r14
+; AVX512F-ONLY-NEXT:    orq %rax, %r14
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $50, %rax
+; AVX512F-ONLY-NEXT:    orq %r14, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k1, %r14d
+; AVX512F-ONLY-NEXT:    kshiftrw $5, %k0, %k1
+; AVX512F-ONLY-NEXT:    shlq $51, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    andl $1, %r14d
+; AVX512F-ONLY-NEXT:    movq %r14, %rax
+; AVX512F-ONLY-NEXT:    shlq $52, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r14, %r12
+; AVX512F-ONLY-NEXT:    shlq $53, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r14, %rax
+; AVX512F-ONLY-NEXT:    shlq $54, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r14, %r12
+; AVX512F-ONLY-NEXT:    shlq $55, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r14, %rax
+; AVX512F-ONLY-NEXT:    shlq $56, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $57, %r14
+; AVX512F-ONLY-NEXT:    orq %rax, %r14
+; AVX512F-ONLY-NEXT:    movzbl %bpl, %eax
+; AVX512F-ONLY-NEXT:    movl %ebp, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $58, %r13
+; AVX512F-ONLY-NEXT:    orq %r14, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %r14
+; AVX512F-ONLY-NEXT:    shlq $59, %r14
+; AVX512F-ONLY-NEXT:    orq %r13, %r14
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $60, %r13
+; AVX512F-ONLY-NEXT:    orq %r14, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %rbp
+; AVX512F-ONLY-NEXT:    shlq $61, %rbp
+; AVX512F-ONLY-NEXT:    orq %r13, %rbp
+; AVX512F-ONLY-NEXT:    kmovw %k1, %r14d
+; AVX512F-ONLY-NEXT:    kshiftrw $7, %k0, %k1
+; AVX512F-ONLY-NEXT:    shlq $62, %r12
+; AVX512F-ONLY-NEXT:    orq %rbp, %r12
+; AVX512F-ONLY-NEXT:    shlq $63, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movq %rax, 40(%rsi)
+; AVX512F-ONLY-NEXT:    movzbl %r14b, %r15d
+; AVX512F-ONLY-NEXT:    movl %r15d, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    leaq (%rax,%rax,2), %r12
+; AVX512F-ONLY-NEXT:    leaq (%r12,%rax,4), %r12
+; AVX512F-ONLY-NEXT:    leaq (%r12,%rax,8), %rax
+; AVX512F-ONLY-NEXT:    movzbl %r11b, %r13d
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $4, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r13, %r11
+; AVX512F-ONLY-NEXT:    shlq $5, %r11
+; AVX512F-ONLY-NEXT:    orq %r12, %r11
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $6, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $7, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $8, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k1, %ebp
+; AVX512F-ONLY-NEXT:    kshiftrw $8, %k0, %k1
+; AVX512F-ONLY-NEXT:    shlq $9, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movzbl %bpl, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $10, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $11, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $12, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $13, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $14, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k1, %ebp
+; AVX512F-ONLY-NEXT:    kshiftrw $9, %k0, %k1
+; AVX512F-ONLY-NEXT:    shlq $15, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movzbl %bpl, %r13d
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $16, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $17, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $18, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $19, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $20, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k1, %ebp
+; AVX512F-ONLY-NEXT:    kshiftrw $15, %k0, %k1
+; AVX512F-ONLY-NEXT:    shlq $21, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movzbl %bpl, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $22, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $23, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $24, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $25, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $26, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $27, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movzbl %r10b, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %r13
+; AVX512F-ONLY-NEXT:    shlq $28, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    movq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $29, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    movq %rax, %r13
+; AVX512F-ONLY-NEXT:    shlq $30, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k1, %ebp
+; AVX512F-ONLY-NEXT:    kshiftrw $11, %k0, %k1
+; AVX512F-ONLY-NEXT:    shlq $31, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    andl $1, %r10d
+; AVX512F-ONLY-NEXT:    movq %r10, %r13
+; AVX512F-ONLY-NEXT:    shlq $32, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k1, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $12, %k0, %k1
+; AVX512F-ONLY-NEXT:    shlq $33, %r10
+; AVX512F-ONLY-NEXT:    orq %r13, %r10
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $34, %rax
+; AVX512F-ONLY-NEXT:    orq %r10, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r10
+; AVX512F-ONLY-NEXT:    shlq $35, %r10
+; AVX512F-ONLY-NEXT:    orq %rax, %r10
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $36, %rax
+; AVX512F-ONLY-NEXT:    orq %r10, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r10
+; AVX512F-ONLY-NEXT:    shlq $37, %r10
+; AVX512F-ONLY-NEXT:    orq %rax, %r10
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $38, %rax
+; AVX512F-ONLY-NEXT:    orq %r10, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k1, %r10d
+; AVX512F-ONLY-NEXT:    kshiftrw $13, %k0, %k1
+; AVX512F-ONLY-NEXT:    shlq $39, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    andl $1, %r10d
+; AVX512F-ONLY-NEXT:    movq %r10, %rax
+; AVX512F-ONLY-NEXT:    shlq $40, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r10, %r12
+; AVX512F-ONLY-NEXT:    shlq $41, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r10, %rax
+; AVX512F-ONLY-NEXT:    shlq $42, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r10, %r12
+; AVX512F-ONLY-NEXT:    shlq $43, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r10, %rax
+; AVX512F-ONLY-NEXT:    shlq $44, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k1, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $14, %k0, %k1
+; AVX512F-ONLY-NEXT:    shlq $45, %r10
+; AVX512F-ONLY-NEXT:    orq %rax, %r10
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $46, %rax
+; AVX512F-ONLY-NEXT:    orq %r10, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r10
+; AVX512F-ONLY-NEXT:    shlq $47, %r10
+; AVX512F-ONLY-NEXT:    orq %rax, %r10
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $48, %rax
+; AVX512F-ONLY-NEXT:    orq %r10, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r10
+; AVX512F-ONLY-NEXT:    shlq $49, %r10
+; AVX512F-ONLY-NEXT:    orq %rax, %r10
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $50, %rax
+; AVX512F-ONLY-NEXT:    orq %r10, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k1, %r13d
+; AVX512F-ONLY-NEXT:    kshiftrw $1, %k0, %k1
+; AVX512F-ONLY-NEXT:    shlq $51, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $52, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r10
+; AVX512F-ONLY-NEXT:    shlq $53, %r10
+; AVX512F-ONLY-NEXT:    orq %rax, %r10
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $54, %rax
+; AVX512F-ONLY-NEXT:    orq %r10, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r10
+; AVX512F-ONLY-NEXT:    shlq $55, %r10
+; AVX512F-ONLY-NEXT:    orq %rax, %r10
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $56, %rax
+; AVX512F-ONLY-NEXT:    orq %r10, %rax
+; AVX512F-ONLY-NEXT:    shlq $57, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movl %ebp, %r10d
+; AVX512F-ONLY-NEXT:    andl $1, %r10d
+; AVX512F-ONLY-NEXT:    movq %r10, %rax
+; AVX512F-ONLY-NEXT:    shlq $58, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r10, %r12
+; AVX512F-ONLY-NEXT:    shlq $59, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r10, %rax
+; AVX512F-ONLY-NEXT:    shlq $60, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r10, %r12
+; AVX512F-ONLY-NEXT:    shlq $61, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k1, %r13d
+; AVX512F-ONLY-NEXT:    kshiftrw $2, %k0, %k1
+; AVX512F-ONLY-NEXT:    shlq $62, %r10
+; AVX512F-ONLY-NEXT:    orq %r12, %r10
+; AVX512F-ONLY-NEXT:    kmovw %k1, %r12d
+; AVX512F-ONLY-NEXT:    movzbl %bpl, %ebp
+; AVX512F-ONLY-NEXT:    shlq $63, %rbp
+; AVX512F-ONLY-NEXT:    orq %r10, %rbp
+; AVX512F-ONLY-NEXT:    kmovw %k0, %r10d
+; AVX512F-ONLY-NEXT:    kshiftrw $4, %k0, %k0
+; AVX512F-ONLY-NEXT:    orq %r11, %rbp
+; AVX512F-ONLY-NEXT:    kmovw %k0, %r11d
+; AVX512F-ONLY-NEXT:    movq %rbp, 16(%rsi)
+; AVX512F-ONLY-NEXT:    movzbl %r9b, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %ebx
+; AVX512F-ONLY-NEXT:    leaq (%rbx,%rbx,2), %r9
+; AVX512F-ONLY-NEXT:    leaq (%r9,%rax,4), %r9
+; AVX512F-ONLY-NEXT:    leaq (%r9,%rax,8), %r9
+; AVX512F-ONLY-NEXT:    movq %rax, %rbx
+; AVX512F-ONLY-NEXT:    shlq $4, %rbx
+; AVX512F-ONLY-NEXT:    orq %r9, %rbx
+; AVX512F-ONLY-NEXT:    movq %rax, %r9
+; AVX512F-ONLY-NEXT:    shlq $5, %r9
+; AVX512F-ONLY-NEXT:    orq %rbx, %r9
+; AVX512F-ONLY-NEXT:    movq %rax, %rbx
+; AVX512F-ONLY-NEXT:    shlq $6, %rbx
+; AVX512F-ONLY-NEXT:    shlq $7, %rax
+; AVX512F-ONLY-NEXT:    orq %rbx, %rax
+; AVX512F-ONLY-NEXT:    movzbl %r8b, %r8d
+; AVX512F-ONLY-NEXT:    andl $1, %r8d
+; AVX512F-ONLY-NEXT:    movq %r8, %rbx
+; AVX512F-ONLY-NEXT:    shlq $8, %rbx
+; AVX512F-ONLY-NEXT:    orq %rax, %rbx
+; AVX512F-ONLY-NEXT:    movq %r8, %rax
+; AVX512F-ONLY-NEXT:    shlq $9, %rax
+; AVX512F-ONLY-NEXT:    orq %rbx, %rax
+; AVX512F-ONLY-NEXT:    movq %r8, %rbx
+; AVX512F-ONLY-NEXT:    shlq $10, %rbx
+; AVX512F-ONLY-NEXT:    orq %rax, %rbx
+; AVX512F-ONLY-NEXT:    movq %r8, %rax
+; AVX512F-ONLY-NEXT:    shlq $11, %rax
+; AVX512F-ONLY-NEXT:    orq %rbx, %rax
+; AVX512F-ONLY-NEXT:    movq %r8, %rbx
+; AVX512F-ONLY-NEXT:    shlq $12, %rbx
+; AVX512F-ONLY-NEXT:    orq %rax, %rbx
+; AVX512F-ONLY-NEXT:    shlq $13, %r8
+; AVX512F-ONLY-NEXT:    orq %rbx, %r8
+; AVX512F-ONLY-NEXT:    movzbl %dil, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %rdi
+; AVX512F-ONLY-NEXT:    shlq $14, %rdi
+; AVX512F-ONLY-NEXT:    orq %r8, %rdi
+; AVX512F-ONLY-NEXT:    movq %rax, %r8
+; AVX512F-ONLY-NEXT:    shlq $15, %r8
+; AVX512F-ONLY-NEXT:    orq %rdi, %r8
+; AVX512F-ONLY-NEXT:    movq %rax, %rdi
+; AVX512F-ONLY-NEXT:    shlq $16, %rdi
+; AVX512F-ONLY-NEXT:    orq %r8, %rdi
+; AVX512F-ONLY-NEXT:    movq %rax, %r8
+; AVX512F-ONLY-NEXT:    shlq $17, %r8
+; AVX512F-ONLY-NEXT:    orq %rdi, %r8
+; AVX512F-ONLY-NEXT:    movq %rax, %rdi
+; AVX512F-ONLY-NEXT:    shlq $18, %rdi
+; AVX512F-ONLY-NEXT:    orq %r8, %rdi
+; AVX512F-ONLY-NEXT:    shlq $19, %rax
+; AVX512F-ONLY-NEXT:    orq %rdi, %rax
+; AVX512F-ONLY-NEXT:    movzbl %dl, %edx
+; AVX512F-ONLY-NEXT:    andl $1, %edx
+; AVX512F-ONLY-NEXT:    movq %rdx, %rdi
+; AVX512F-ONLY-NEXT:    shlq $20, %rdi
+; AVX512F-ONLY-NEXT:    orq %rax, %rdi
+; AVX512F-ONLY-NEXT:    movq %rdx, %rax
+; AVX512F-ONLY-NEXT:    shlq $21, %rax
+; AVX512F-ONLY-NEXT:    orq %rdi, %rax
+; AVX512F-ONLY-NEXT:    movq %rdx, %rdi
+; AVX512F-ONLY-NEXT:    shlq $22, %rdi
+; AVX512F-ONLY-NEXT:    orq %rax, %rdi
+; AVX512F-ONLY-NEXT:    movq %rdx, %rax
+; AVX512F-ONLY-NEXT:    shlq $23, %rax
+; AVX512F-ONLY-NEXT:    orq %rdi, %rax
+; AVX512F-ONLY-NEXT:    movq %rdx, %rdi
+; AVX512F-ONLY-NEXT:    shlq $24, %rdi
+; AVX512F-ONLY-NEXT:    orq %rax, %rdi
+; AVX512F-ONLY-NEXT:    shlq $25, %rdx
+; AVX512F-ONLY-NEXT:    orq %rdi, %rdx
+; AVX512F-ONLY-NEXT:    movzbl %cl, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %rcx
+; AVX512F-ONLY-NEXT:    shlq $26, %rcx
+; AVX512F-ONLY-NEXT:    orq %rdx, %rcx
+; AVX512F-ONLY-NEXT:    movq %rax, %rdx
+; AVX512F-ONLY-NEXT:    shlq $27, %rdx
+; AVX512F-ONLY-NEXT:    orq %rcx, %rdx
+; AVX512F-ONLY-NEXT:    movq %rax, %rcx
+; AVX512F-ONLY-NEXT:    shlq $28, %rcx
+; AVX512F-ONLY-NEXT:    orq %rdx, %rcx
+; AVX512F-ONLY-NEXT:    movq %rax, %rdx
+; AVX512F-ONLY-NEXT:    shlq $29, %rdx
+; AVX512F-ONLY-NEXT:    orq %rcx, %rdx
+; AVX512F-ONLY-NEXT:    movq %rax, %rcx
+; AVX512F-ONLY-NEXT:    shlq $30, %rcx
+; AVX512F-ONLY-NEXT:    orq %rdx, %rcx
+; AVX512F-ONLY-NEXT:    shlq $31, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    andl $1, %r10d
+; AVX512F-ONLY-NEXT:    movq %r10, %rcx
+; AVX512F-ONLY-NEXT:    shlq $32, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    movq %r10, %rax
+; AVX512F-ONLY-NEXT:    shlq $33, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    movq %r10, %rcx
+; AVX512F-ONLY-NEXT:    shlq $34, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    movq %r10, %rax
+; AVX512F-ONLY-NEXT:    shlq $35, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    movq %r10, %rcx
+; AVX512F-ONLY-NEXT:    shlq $36, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    shlq $37, %r10
+; AVX512F-ONLY-NEXT:    orq %rcx, %r10
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $38, %rax
+; AVX512F-ONLY-NEXT:    orq %r10, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %rcx
+; AVX512F-ONLY-NEXT:    shlq $39, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $40, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %rcx
+; AVX512F-ONLY-NEXT:    shlq $41, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $42, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    shlq $43, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $44, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %rcx
+; AVX512F-ONLY-NEXT:    shlq $45, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $46, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %rcx
+; AVX512F-ONLY-NEXT:    shlq $47, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $48, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    shlq $49, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %rcx
+; AVX512F-ONLY-NEXT:    shlq $50, %rcx
+; AVX512F-ONLY-NEXT:    orq %r12, %rcx
+; AVX512F-ONLY-NEXT:    movq %rax, %rdx
+; AVX512F-ONLY-NEXT:    shlq $51, %rdx
+; AVX512F-ONLY-NEXT:    orq %rcx, %rdx
+; AVX512F-ONLY-NEXT:    movq %rax, %rcx
+; AVX512F-ONLY-NEXT:    shlq $52, %rcx
+; AVX512F-ONLY-NEXT:    orq %rdx, %rcx
+; AVX512F-ONLY-NEXT:    movq %rax, %rdx
+; AVX512F-ONLY-NEXT:    shlq $53, %rdx
+; AVX512F-ONLY-NEXT:    orq %rcx, %rdx
+; AVX512F-ONLY-NEXT:    movq %rax, %rcx
+; AVX512F-ONLY-NEXT:    shlq $54, %rcx
+; AVX512F-ONLY-NEXT:    orq %rdx, %rcx
+; AVX512F-ONLY-NEXT:    shlq $55, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    andl $1, %r11d
+; AVX512F-ONLY-NEXT:    movq %r11, %rcx
+; AVX512F-ONLY-NEXT:    shlq $56, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    movq %r11, %rax
+; AVX512F-ONLY-NEXT:    shlq $57, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    movq %r11, %rcx
+; AVX512F-ONLY-NEXT:    shlq $58, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    movq %r11, %rax
+; AVX512F-ONLY-NEXT:    shlq $59, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    movq %r11, %rcx
+; AVX512F-ONLY-NEXT:    shlq $60, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    shlq $61, %r11
+; AVX512F-ONLY-NEXT:    orq %rcx, %r11
+; AVX512F-ONLY-NEXT:    andl $1, %r14d
+; AVX512F-ONLY-NEXT:    shlq $62, %r14
+; AVX512F-ONLY-NEXT:    orq %r11, %r14
+; AVX512F-ONLY-NEXT:    shlq $63, %r15
+; AVX512F-ONLY-NEXT:    orq %r14, %r15
+; AVX512F-ONLY-NEXT:    orq %r9, %r15
+; AVX512F-ONLY-NEXT:    movq %r15, 8(%rsi)
+; AVX512F-ONLY-NEXT:    popq %rbx
+; AVX512F-ONLY-NEXT:    popq %r12
+; AVX512F-ONLY-NEXT:    popq %r13
+; AVX512F-ONLY-NEXT:    popq %r14
+; AVX512F-ONLY-NEXT:    popq %r15
+; AVX512F-ONLY-NEXT:    popq %rbp
+; AVX512F-ONLY-NEXT:    retq
+;
+; AVX512DQ-LABEL: mask_replication_factor6_vf64:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    pushq %rbp
+; AVX512DQ-NEXT:    pushq %r15
+; AVX512DQ-NEXT:    pushq %r14
+; AVX512DQ-NEXT:    pushq %r13
+; AVX512DQ-NEXT:    pushq %r12
+; AVX512DQ-NEXT:    pushq %rbx
+; AVX512DQ-NEXT:    kmovw (%rdi), %k3
+; AVX512DQ-NEXT:    kmovw 2(%rdi), %k0
+; AVX512DQ-NEXT:    kmovw 4(%rdi), %k2
+; AVX512DQ-NEXT:    kmovw 6(%rdi), %k1
+; AVX512DQ-NEXT:    kshiftrw $3, %k0, %k4
+; AVX512DQ-NEXT:    kmovw %k4, %eax
+; AVX512DQ-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512DQ-NEXT:    kshiftrw $15, %k3, %k4
+; AVX512DQ-NEXT:    kmovw %k4, %ecx
+; AVX512DQ-NEXT:    kshiftrw $14, %k3, %k4
+; AVX512DQ-NEXT:    kmovw %k4, %edx
+; AVX512DQ-NEXT:    kshiftrw $13, %k3, %k4
+; AVX512DQ-NEXT:    kmovw %k4, %edi
+; AVX512DQ-NEXT:    kshiftrw $12, %k3, %k4
+; AVX512DQ-NEXT:    kmovw %k4, %r8d
+; AVX512DQ-NEXT:    kshiftrw $11, %k3, %k4
+; AVX512DQ-NEXT:    kmovw %k4, %r9d
+; AVX512DQ-NEXT:    kshiftrw $10, %k0, %k4
+; AVX512DQ-NEXT:    kmovw %k4, %r10d
+; AVX512DQ-NEXT:    kshiftrw $6, %k0, %k4
+; AVX512DQ-NEXT:    kmovw %k4, %r11d
+; AVX512DQ-NEXT:    kshiftrw $5, %k3, %k4
+; AVX512DQ-NEXT:    kmovw %k4, %ebx
+; AVX512DQ-NEXT:    kshiftrw $10, %k2, %k4
+; AVX512DQ-NEXT:    kmovw %k4, %ebp
+; AVX512DQ-NEXT:    kshiftrw $1, %k2, %k4
+; AVX512DQ-NEXT:    kmovw %k4, %r15d
+; AVX512DQ-NEXT:    kshiftrw $2, %k2, %k4
+; AVX512DQ-NEXT:    kmovw %k2, %r14d
+; AVX512DQ-NEXT:    movzbl %r14b, %r14d
+; AVX512DQ-NEXT:    andl $1, %r14d
+; AVX512DQ-NEXT:    leaq (%r14,%r14,2), %r12
+; AVX512DQ-NEXT:    leaq (%r12,%r14,4), %r12
+; AVX512DQ-NEXT:    leaq (%r12,%r14,8), %r12
+; AVX512DQ-NEXT:    movq %r14, %r13
+; AVX512DQ-NEXT:    shlq $4, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    shlq $5, %r14
+; AVX512DQ-NEXT:    orq %r13, %r14
+; AVX512DQ-NEXT:    movzbl %r15b, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %r15
+; AVX512DQ-NEXT:    shlq $6, %r15
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $7, %r13
+; AVX512DQ-NEXT:    orq %r15, %r13
+; AVX512DQ-NEXT:    movq %r12, %r15
+; AVX512DQ-NEXT:    shlq $8, %r15
+; AVX512DQ-NEXT:    orq %r13, %r15
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $9, %r13
+; AVX512DQ-NEXT:    orq %r15, %r13
+; AVX512DQ-NEXT:    movq %r12, %r15
+; AVX512DQ-NEXT:    shlq $10, %r15
+; AVX512DQ-NEXT:    orq %r13, %r15
+; AVX512DQ-NEXT:    kmovw %k4, %r13d
+; AVX512DQ-NEXT:    kshiftrw $3, %k2, %k4
+; AVX512DQ-NEXT:    shlq $11, %r12
+; AVX512DQ-NEXT:    orq %r15, %r12
+; AVX512DQ-NEXT:    movzbl %r13b, %r15d
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    movq %r15, %r13
+; AVX512DQ-NEXT:    shlq $12, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $13, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    movq %r15, %r13
+; AVX512DQ-NEXT:    shlq $14, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $15, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    movq %r15, %r13
+; AVX512DQ-NEXT:    shlq $16, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    kmovw %k4, %r12d
+; AVX512DQ-NEXT:    kshiftrw $4, %k2, %k4
+; AVX512DQ-NEXT:    shlq $17, %r15
+; AVX512DQ-NEXT:    orq %r13, %r15
+; AVX512DQ-NEXT:    movzbl %r12b, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $18, %r13
+; AVX512DQ-NEXT:    orq %r15, %r13
+; AVX512DQ-NEXT:    movq %r12, %r15
+; AVX512DQ-NEXT:    shlq $19, %r15
+; AVX512DQ-NEXT:    orq %r13, %r15
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $20, %r13
+; AVX512DQ-NEXT:    orq %r15, %r13
+; AVX512DQ-NEXT:    movq %r12, %r15
+; AVX512DQ-NEXT:    shlq $21, %r15
+; AVX512DQ-NEXT:    orq %r13, %r15
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $22, %r13
+; AVX512DQ-NEXT:    orq %r15, %r13
+; AVX512DQ-NEXT:    kmovw %k4, %r15d
+; AVX512DQ-NEXT:    kshiftrw $5, %k2, %k4
+; AVX512DQ-NEXT:    shlq $23, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    movzbl %r15b, %r15d
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    movq %r15, %r13
+; AVX512DQ-NEXT:    shlq $24, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $25, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    movq %r15, %r13
+; AVX512DQ-NEXT:    shlq $26, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $27, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    movq %r15, %r13
+; AVX512DQ-NEXT:    shlq $28, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    kmovw %k4, %r12d
+; AVX512DQ-NEXT:    kshiftrw $6, %k2, %k4
+; AVX512DQ-NEXT:    shlq $29, %r15
+; AVX512DQ-NEXT:    orq %r13, %r15
+; AVX512DQ-NEXT:    movzbl %r12b, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $30, %r13
+; AVX512DQ-NEXT:    orq %r15, %r13
+; AVX512DQ-NEXT:    movq %r12, %r15
+; AVX512DQ-NEXT:    shlq $31, %r15
+; AVX512DQ-NEXT:    orq %r13, %r15
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $32, %r13
+; AVX512DQ-NEXT:    orq %r15, %r13
+; AVX512DQ-NEXT:    movq %r12, %r15
+; AVX512DQ-NEXT:    shlq $33, %r15
+; AVX512DQ-NEXT:    orq %r13, %r15
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $34, %r13
+; AVX512DQ-NEXT:    orq %r15, %r13
+; AVX512DQ-NEXT:    kmovw %k4, %r15d
+; AVX512DQ-NEXT:    kshiftrw $7, %k2, %k4
+; AVX512DQ-NEXT:    shlq $35, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    movzbl %r15b, %r15d
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    movq %r15, %r13
+; AVX512DQ-NEXT:    shlq $36, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $37, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    movq %r15, %r13
+; AVX512DQ-NEXT:    shlq $38, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $39, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    movq %r15, %r13
+; AVX512DQ-NEXT:    shlq $40, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    kmovw %k4, %r12d
+; AVX512DQ-NEXT:    kshiftrw $8, %k2, %k4
+; AVX512DQ-NEXT:    shlq $41, %r15
+; AVX512DQ-NEXT:    orq %r13, %r15
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $42, %r13
+; AVX512DQ-NEXT:    orq %r15, %r13
+; AVX512DQ-NEXT:    movq %r12, %r15
+; AVX512DQ-NEXT:    shlq $43, %r15
+; AVX512DQ-NEXT:    orq %r13, %r15
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $44, %r13
+; AVX512DQ-NEXT:    orq %r15, %r13
+; AVX512DQ-NEXT:    movq %r12, %r15
+; AVX512DQ-NEXT:    shlq $45, %r15
+; AVX512DQ-NEXT:    orq %r13, %r15
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $46, %r13
+; AVX512DQ-NEXT:    orq %r15, %r13
+; AVX512DQ-NEXT:    kmovw %k4, %r15d
+; AVX512DQ-NEXT:    kshiftrw $9, %k2, %k4
+; AVX512DQ-NEXT:    shlq $47, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    movq %r15, %r13
+; AVX512DQ-NEXT:    shlq $48, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $49, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    movq %r15, %r13
+; AVX512DQ-NEXT:    shlq $50, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $51, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    movq %r15, %r13
+; AVX512DQ-NEXT:    shlq $52, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    kmovw %k4, %r12d
+; AVX512DQ-NEXT:    kshiftrw $1, %k3, %k4
+; AVX512DQ-NEXT:    shlq $53, %r15
+; AVX512DQ-NEXT:    orq %r13, %r15
+; AVX512DQ-NEXT:    movzbl %r12b, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $54, %r13
+; AVX512DQ-NEXT:    orq %r15, %r13
+; AVX512DQ-NEXT:    movq %r12, %r15
+; AVX512DQ-NEXT:    shlq $55, %r15
+; AVX512DQ-NEXT:    orq %r13, %r15
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $56, %r13
+; AVX512DQ-NEXT:    orq %r15, %r13
+; AVX512DQ-NEXT:    movq %r12, %r15
+; AVX512DQ-NEXT:    shlq $57, %r15
+; AVX512DQ-NEXT:    orq %r13, %r15
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $58, %r13
+; AVX512DQ-NEXT:    orq %r15, %r13
+; AVX512DQ-NEXT:    shlq $59, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    movzbl %bpl, %r15d
+; AVX512DQ-NEXT:    movl %ebp, %r13d
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %rbp
+; AVX512DQ-NEXT:    shlq $60, %rbp
+; AVX512DQ-NEXT:    orq %r12, %rbp
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $61, %r12
+; AVX512DQ-NEXT:    orq %rbp, %r12
+; AVX512DQ-NEXT:    kmovw %k4, %ebp
+; AVX512DQ-NEXT:    kshiftrw $2, %k3, %k4
+; AVX512DQ-NEXT:    shlq $62, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $63, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    orq %r14, %r12
+; AVX512DQ-NEXT:    movq %r12, 24(%rsi)
+; AVX512DQ-NEXT:    kmovw %k3, %r14d
+; AVX512DQ-NEXT:    movzbl %r14b, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    leaq (%r12,%r12,2), %r14
+; AVX512DQ-NEXT:    leaq (%r14,%r12,4), %r14
+; AVX512DQ-NEXT:    leaq (%r14,%r12,8), %r14
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $4, %r13
+; AVX512DQ-NEXT:    orq %r14, %r13
+; AVX512DQ-NEXT:    shlq $5, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    movzbl %bpl, %r13d
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %r14
+; AVX512DQ-NEXT:    shlq $6, %r14
+; AVX512DQ-NEXT:    movq %r13, %rbp
+; AVX512DQ-NEXT:    shlq $7, %rbp
+; AVX512DQ-NEXT:    orq %r14, %rbp
+; AVX512DQ-NEXT:    movq %r13, %r14
+; AVX512DQ-NEXT:    shlq $8, %r14
+; AVX512DQ-NEXT:    orq %rbp, %r14
+; AVX512DQ-NEXT:    movq %r13, %rbp
+; AVX512DQ-NEXT:    shlq $9, %rbp
+; AVX512DQ-NEXT:    orq %r14, %rbp
+; AVX512DQ-NEXT:    movq %r13, %r14
+; AVX512DQ-NEXT:    shlq $10, %r14
+; AVX512DQ-NEXT:    orq %rbp, %r14
+; AVX512DQ-NEXT:    kmovw %k4, %ebp
+; AVX512DQ-NEXT:    kshiftrw $3, %k3, %k4
+; AVX512DQ-NEXT:    shlq $11, %r13
+; AVX512DQ-NEXT:    orq %r14, %r13
+; AVX512DQ-NEXT:    movzbl %bpl, %r14d
+; AVX512DQ-NEXT:    andl $1, %r14d
+; AVX512DQ-NEXT:    movq %r14, %rbp
+; AVX512DQ-NEXT:    shlq $12, %rbp
+; AVX512DQ-NEXT:    orq %r13, %rbp
+; AVX512DQ-NEXT:    movq %r14, %r13
+; AVX512DQ-NEXT:    shlq $13, %r13
+; AVX512DQ-NEXT:    orq %rbp, %r13
+; AVX512DQ-NEXT:    movq %r14, %rbp
+; AVX512DQ-NEXT:    shlq $14, %rbp
+; AVX512DQ-NEXT:    orq %r13, %rbp
+; AVX512DQ-NEXT:    movq %r14, %r13
+; AVX512DQ-NEXT:    shlq $15, %r13
+; AVX512DQ-NEXT:    orq %rbp, %r13
+; AVX512DQ-NEXT:    movq %r14, %rbp
+; AVX512DQ-NEXT:    shlq $16, %rbp
+; AVX512DQ-NEXT:    orq %r13, %rbp
+; AVX512DQ-NEXT:    kmovw %k4, %r13d
+; AVX512DQ-NEXT:    kshiftrw $4, %k3, %k4
+; AVX512DQ-NEXT:    shlq $17, %r14
+; AVX512DQ-NEXT:    orq %rbp, %r14
+; AVX512DQ-NEXT:    movzbl %r13b, %r13d
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %rbp
+; AVX512DQ-NEXT:    shlq $18, %rbp
+; AVX512DQ-NEXT:    orq %r14, %rbp
+; AVX512DQ-NEXT:    movq %r13, %r14
+; AVX512DQ-NEXT:    shlq $19, %r14
+; AVX512DQ-NEXT:    orq %rbp, %r14
+; AVX512DQ-NEXT:    movq %r13, %rbp
+; AVX512DQ-NEXT:    shlq $20, %rbp
+; AVX512DQ-NEXT:    orq %r14, %rbp
+; AVX512DQ-NEXT:    movq %r13, %r14
+; AVX512DQ-NEXT:    shlq $21, %r14
+; AVX512DQ-NEXT:    orq %rbp, %r14
+; AVX512DQ-NEXT:    movq %r13, %rbp
+; AVX512DQ-NEXT:    shlq $22, %rbp
+; AVX512DQ-NEXT:    orq %r14, %rbp
+; AVX512DQ-NEXT:    kmovw %k4, %r14d
+; AVX512DQ-NEXT:    kshiftrw $10, %k3, %k4
+; AVX512DQ-NEXT:    shlq $23, %r13
+; AVX512DQ-NEXT:    orq %rbp, %r13
+; AVX512DQ-NEXT:    movzbl %r14b, %r14d
+; AVX512DQ-NEXT:    andl $1, %r14d
+; AVX512DQ-NEXT:    movq %r14, %rbp
+; AVX512DQ-NEXT:    shlq $24, %rbp
+; AVX512DQ-NEXT:    orq %r13, %rbp
+; AVX512DQ-NEXT:    movq %r14, %r13
+; AVX512DQ-NEXT:    shlq $25, %r13
+; AVX512DQ-NEXT:    orq %rbp, %r13
+; AVX512DQ-NEXT:    movq %r14, %rbp
+; AVX512DQ-NEXT:    shlq $26, %rbp
+; AVX512DQ-NEXT:    orq %r13, %rbp
+; AVX512DQ-NEXT:    movq %r14, %r13
+; AVX512DQ-NEXT:    shlq $27, %r13
+; AVX512DQ-NEXT:    orq %rbp, %r13
+; AVX512DQ-NEXT:    movq %r14, %rbp
+; AVX512DQ-NEXT:    shlq $28, %rbp
+; AVX512DQ-NEXT:    orq %r13, %rbp
+; AVX512DQ-NEXT:    shlq $29, %r14
+; AVX512DQ-NEXT:    orq %rbp, %r14
+; AVX512DQ-NEXT:    movzbl %bl, %r13d
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $30, %rax
+; AVX512DQ-NEXT:    orq %r14, %rax
+; AVX512DQ-NEXT:    kmovw %k4, %ebp
+; AVX512DQ-NEXT:    kshiftrw $6, %k3, %k4
+; AVX512DQ-NEXT:    shlq $31, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    andl $1, %ebx
+; AVX512DQ-NEXT:    movq %rbx, %rax
+; AVX512DQ-NEXT:    shlq $32, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %rbx, %r14
+; AVX512DQ-NEXT:    shlq $33, %r14
+; AVX512DQ-NEXT:    orq %rax, %r14
+; AVX512DQ-NEXT:    movq %rbx, %rax
+; AVX512DQ-NEXT:    shlq $34, %rax
+; AVX512DQ-NEXT:    orq %r14, %rax
+; AVX512DQ-NEXT:    kmovw %k4, %r14d
+; AVX512DQ-NEXT:    kshiftrw $7, %k3, %k4
+; AVX512DQ-NEXT:    shlq $35, %rbx
+; AVX512DQ-NEXT:    orq %rax, %rbx
+; AVX512DQ-NEXT:    andl $1, %r14d
+; AVX512DQ-NEXT:    movq %r14, %rax
+; AVX512DQ-NEXT:    shlq $36, %rax
+; AVX512DQ-NEXT:    orq %rbx, %rax
+; AVX512DQ-NEXT:    movq %r14, %rbx
+; AVX512DQ-NEXT:    shlq $37, %rbx
+; AVX512DQ-NEXT:    orq %rax, %rbx
+; AVX512DQ-NEXT:    movq %r14, %rax
+; AVX512DQ-NEXT:    shlq $38, %rax
+; AVX512DQ-NEXT:    orq %rbx, %rax
+; AVX512DQ-NEXT:    movq %r14, %rbx
+; AVX512DQ-NEXT:    shlq $39, %rbx
+; AVX512DQ-NEXT:    orq %rax, %rbx
+; AVX512DQ-NEXT:    movq %r14, %rax
+; AVX512DQ-NEXT:    shlq $40, %rax
+; AVX512DQ-NEXT:    orq %rbx, %rax
+; AVX512DQ-NEXT:    kmovw %k4, %ebx
+; AVX512DQ-NEXT:    kshiftrw $8, %k3, %k4
+; AVX512DQ-NEXT:    shlq $41, %r14
+; AVX512DQ-NEXT:    orq %rax, %r14
+; AVX512DQ-NEXT:    andl $1, %ebx
+; AVX512DQ-NEXT:    movq %rbx, %rax
+; AVX512DQ-NEXT:    shlq $42, %rax
+; AVX512DQ-NEXT:    orq %r14, %rax
+; AVX512DQ-NEXT:    movq %rbx, %r14
+; AVX512DQ-NEXT:    shlq $43, %r14
+; AVX512DQ-NEXT:    orq %rax, %r14
+; AVX512DQ-NEXT:    movq %rbx, %rax
+; AVX512DQ-NEXT:    shlq $44, %rax
+; AVX512DQ-NEXT:    orq %r14, %rax
+; AVX512DQ-NEXT:    movq %rbx, %r14
+; AVX512DQ-NEXT:    shlq $45, %r14
+; AVX512DQ-NEXT:    orq %rax, %r14
+; AVX512DQ-NEXT:    movq %rbx, %rax
+; AVX512DQ-NEXT:    shlq $46, %rax
+; AVX512DQ-NEXT:    orq %r14, %rax
+; AVX512DQ-NEXT:    kmovw %k4, %r14d
+; AVX512DQ-NEXT:    kshiftrw $9, %k3, %k3
+; AVX512DQ-NEXT:    shlq $47, %rbx
+; AVX512DQ-NEXT:    orq %rax, %rbx
+; AVX512DQ-NEXT:    andl $1, %r14d
+; AVX512DQ-NEXT:    movq %r14, %rax
+; AVX512DQ-NEXT:    shlq $48, %rax
+; AVX512DQ-NEXT:    orq %rbx, %rax
+; AVX512DQ-NEXT:    movq %r14, %rbx
+; AVX512DQ-NEXT:    shlq $49, %rbx
+; AVX512DQ-NEXT:    orq %rax, %rbx
+; AVX512DQ-NEXT:    movq %r14, %rax
+; AVX512DQ-NEXT:    shlq $50, %rax
+; AVX512DQ-NEXT:    orq %rbx, %rax
+; AVX512DQ-NEXT:    movq %r14, %rbx
+; AVX512DQ-NEXT:    shlq $51, %rbx
+; AVX512DQ-NEXT:    orq %rax, %rbx
+; AVX512DQ-NEXT:    movq %r14, %rax
+; AVX512DQ-NEXT:    shlq $52, %rax
+; AVX512DQ-NEXT:    orq %rbx, %rax
+; AVX512DQ-NEXT:    kmovw %k3, %ebx
+; AVX512DQ-NEXT:    kshiftrw $10, %k1, %k3
+; AVX512DQ-NEXT:    shlq $53, %r14
+; AVX512DQ-NEXT:    orq %rax, %r14
+; AVX512DQ-NEXT:    andl $1, %ebx
+; AVX512DQ-NEXT:    movq %rbx, %rax
+; AVX512DQ-NEXT:    shlq $54, %rax
+; AVX512DQ-NEXT:    orq %r14, %rax
+; AVX512DQ-NEXT:    movq %rbx, %r14
+; AVX512DQ-NEXT:    shlq $55, %r14
+; AVX512DQ-NEXT:    orq %rax, %r14
+; AVX512DQ-NEXT:    movq %rbx, %rax
+; AVX512DQ-NEXT:    shlq $56, %rax
+; AVX512DQ-NEXT:    orq %r14, %rax
+; AVX512DQ-NEXT:    movq %rbx, %r14
+; AVX512DQ-NEXT:    shlq $57, %r14
+; AVX512DQ-NEXT:    orq %rax, %r14
+; AVX512DQ-NEXT:    movq %rbx, %rax
+; AVX512DQ-NEXT:    shlq $58, %rax
+; AVX512DQ-NEXT:    orq %r14, %rax
+; AVX512DQ-NEXT:    shlq $59, %rbx
+; AVX512DQ-NEXT:    orq %rax, %rbx
+; AVX512DQ-NEXT:    movl %ebp, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %r14
+; AVX512DQ-NEXT:    shlq $60, %r14
+; AVX512DQ-NEXT:    orq %rbx, %r14
+; AVX512DQ-NEXT:    movq %rax, %r13
+; AVX512DQ-NEXT:    shlq $61, %r13
+; AVX512DQ-NEXT:    orq %r14, %r13
+; AVX512DQ-NEXT:    kmovw %k3, %r14d
+; AVX512DQ-NEXT:    kshiftrw $12, %k2, %k3
+; AVX512DQ-NEXT:    movzbl %bpl, %ebx
+; AVX512DQ-NEXT:    shlq $62, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %rbx, %r13
+; AVX512DQ-NEXT:    shlq $63, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    kmovw %k3, %eax
+; AVX512DQ-NEXT:    kshiftrw $11, %k2, %k3
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    kmovw %k3, %ebp
+; AVX512DQ-NEXT:    kshiftrw $13, %k2, %k3
+; AVX512DQ-NEXT:    movq %r13, (%rsi)
+; AVX512DQ-NEXT:    movzbl %bpl, %r13d
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    leaq (%r15,%r15,2), %r15
+; AVX512DQ-NEXT:    leaq (%r15,%r13,4), %r15
+; AVX512DQ-NEXT:    leaq (%r15,%r13,8), %r15
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $4, %r12
+; AVX512DQ-NEXT:    orq %r15, %r12
+; AVX512DQ-NEXT:    movq %r13, %r15
+; AVX512DQ-NEXT:    shlq $5, %r15
+; AVX512DQ-NEXT:    orq %r12, %r15
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $6, %r12
+; AVX512DQ-NEXT:    shlq $7, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    movzbl %al, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $8, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $9, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $10, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $11, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $12, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    kmovw %k3, %ebp
+; AVX512DQ-NEXT:    kshiftrw $14, %k2, %k3
+; AVX512DQ-NEXT:    shlq $13, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movzbl %bpl, %r13d
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $14, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $15, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $16, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $17, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $18, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    kmovw %k3, %ebp
+; AVX512DQ-NEXT:    kshiftrw $15, %k2, %k2
+; AVX512DQ-NEXT:    shlq $19, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movzbl %bpl, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $20, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $21, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $22, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $23, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $24, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    kmovw %k2, %ebp
+; AVX512DQ-NEXT:    kshiftrw $1, %k1, %k2
+; AVX512DQ-NEXT:    shlq $25, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movzbl %bpl, %r13d
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $26, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $27, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $28, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $29, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $30, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    shlq $31, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    kmovw %k1, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $32, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $33, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $34, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $35, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $36, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    kmovw %k2, %r13d
+; AVX512DQ-NEXT:    kshiftrw $2, %k1, %k2
+; AVX512DQ-NEXT:    shlq $37, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $38, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $39, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $40, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $41, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $42, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    kmovw %k2, %r12d
+; AVX512DQ-NEXT:    kshiftrw $3, %k1, %k2
+; AVX512DQ-NEXT:    shlq $43, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $44, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $45, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $46, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $47, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $48, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    kmovw %k2, %r13d
+; AVX512DQ-NEXT:    kshiftrw $4, %k1, %k2
+; AVX512DQ-NEXT:    shlq $49, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $50, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $51, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $52, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $53, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $54, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    kmovw %k2, %r12d
+; AVX512DQ-NEXT:    kshiftrw $5, %k1, %k2
+; AVX512DQ-NEXT:    shlq $55, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $56, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $57, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $58, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $59, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $60, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    kmovw %k2, %ebp
+; AVX512DQ-NEXT:    kshiftrw $6, %k1, %k2
+; AVX512DQ-NEXT:    shlq $61, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movzbl %bpl, %eax
+; AVX512DQ-NEXT:    movl %ebp, %r13d
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    shlq $62, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    movq %rax, %r12
+; AVX512DQ-NEXT:    shlq $63, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    kmovw %k2, %ebp
+; AVX512DQ-NEXT:    kshiftrw $7, %k1, %k2
+; AVX512DQ-NEXT:    orq %r15, %r12
+; AVX512DQ-NEXT:    movq %r12, 32(%rsi)
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    leaq (%rax,%rax,2), %r15
+; AVX512DQ-NEXT:    leaq (%r15,%rax,4), %r15
+; AVX512DQ-NEXT:    leaq (%r15,%rax,8), %rax
+; AVX512DQ-NEXT:    movzbl %bpl, %r13d
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $4, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r13, %r15
+; AVX512DQ-NEXT:    shlq $5, %r15
+; AVX512DQ-NEXT:    orq %r12, %r15
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $6, %rax
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $7, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $8, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    kmovw %k2, %ebp
+; AVX512DQ-NEXT:    kshiftrw $8, %k1, %k2
+; AVX512DQ-NEXT:    shlq $9, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movzbl %bpl, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $10, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $11, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $12, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $13, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $14, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    kmovw %k2, %ebp
+; AVX512DQ-NEXT:    kshiftrw $9, %k1, %k2
+; AVX512DQ-NEXT:    shlq $15, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movzbl %bpl, %r13d
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $16, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $17, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $18, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $19, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $20, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    kmovw %k2, %ebp
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k2
+; AVX512DQ-NEXT:    shlq $21, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movzbl %bpl, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $22, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $23, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $24, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $25, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $26, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    shlq $27, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movzbl %r14b, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %r13
+; AVX512DQ-NEXT:    shlq $28, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    movq %rax, %r12
+; AVX512DQ-NEXT:    shlq $29, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    movq %rax, %r13
+; AVX512DQ-NEXT:    shlq $30, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    kmovw %k2, %ebp
+; AVX512DQ-NEXT:    kshiftrw $11, %k1, %k2
+; AVX512DQ-NEXT:    shlq $31, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    andl $1, %r14d
+; AVX512DQ-NEXT:    movq %r14, %r13
+; AVX512DQ-NEXT:    shlq $32, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    kmovw %k2, %r12d
+; AVX512DQ-NEXT:    kshiftrw $12, %k1, %k2
+; AVX512DQ-NEXT:    shlq $33, %r14
+; AVX512DQ-NEXT:    orq %r13, %r14
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $34, %rax
+; AVX512DQ-NEXT:    orq %r14, %rax
+; AVX512DQ-NEXT:    movq %r12, %r14
+; AVX512DQ-NEXT:    shlq $35, %r14
+; AVX512DQ-NEXT:    orq %rax, %r14
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $36, %rax
+; AVX512DQ-NEXT:    orq %r14, %rax
+; AVX512DQ-NEXT:    movq %r12, %r14
+; AVX512DQ-NEXT:    shlq $37, %r14
+; AVX512DQ-NEXT:    orq %rax, %r14
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $38, %rax
+; AVX512DQ-NEXT:    orq %r14, %rax
+; AVX512DQ-NEXT:    kmovw %k2, %r14d
+; AVX512DQ-NEXT:    kshiftrw $13, %k1, %k2
+; AVX512DQ-NEXT:    shlq $39, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    andl $1, %r14d
+; AVX512DQ-NEXT:    movq %r14, %rax
+; AVX512DQ-NEXT:    shlq $40, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r14, %r12
+; AVX512DQ-NEXT:    shlq $41, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r14, %rax
+; AVX512DQ-NEXT:    shlq $42, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r14, %r12
+; AVX512DQ-NEXT:    shlq $43, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r14, %rax
+; AVX512DQ-NEXT:    shlq $44, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    kmovw %k2, %r12d
+; AVX512DQ-NEXT:    kshiftrw $14, %k1, %k1
+; AVX512DQ-NEXT:    shlq $45, %r14
+; AVX512DQ-NEXT:    orq %rax, %r14
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $46, %rax
+; AVX512DQ-NEXT:    orq %r14, %rax
+; AVX512DQ-NEXT:    movq %r12, %r14
+; AVX512DQ-NEXT:    shlq $47, %r14
+; AVX512DQ-NEXT:    orq %rax, %r14
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $48, %rax
+; AVX512DQ-NEXT:    orq %r14, %rax
+; AVX512DQ-NEXT:    movq %r12, %r14
+; AVX512DQ-NEXT:    shlq $49, %r14
+; AVX512DQ-NEXT:    orq %rax, %r14
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $50, %rax
+; AVX512DQ-NEXT:    orq %r14, %rax
+; AVX512DQ-NEXT:    kmovw %k1, %r14d
+; AVX512DQ-NEXT:    kshiftrw $5, %k0, %k1
+; AVX512DQ-NEXT:    shlq $51, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    andl $1, %r14d
+; AVX512DQ-NEXT:    movq %r14, %rax
+; AVX512DQ-NEXT:    shlq $52, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r14, %r12
+; AVX512DQ-NEXT:    shlq $53, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r14, %rax
+; AVX512DQ-NEXT:    shlq $54, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r14, %r12
+; AVX512DQ-NEXT:    shlq $55, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r14, %rax
+; AVX512DQ-NEXT:    shlq $56, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    shlq $57, %r14
+; AVX512DQ-NEXT:    orq %rax, %r14
+; AVX512DQ-NEXT:    movzbl %bpl, %eax
+; AVX512DQ-NEXT:    movl %ebp, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $58, %r13
+; AVX512DQ-NEXT:    orq %r14, %r13
+; AVX512DQ-NEXT:    movq %r12, %r14
+; AVX512DQ-NEXT:    shlq $59, %r14
+; AVX512DQ-NEXT:    orq %r13, %r14
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $60, %r13
+; AVX512DQ-NEXT:    orq %r14, %r13
+; AVX512DQ-NEXT:    movq %r12, %rbp
+; AVX512DQ-NEXT:    shlq $61, %rbp
+; AVX512DQ-NEXT:    orq %r13, %rbp
+; AVX512DQ-NEXT:    kmovw %k1, %r14d
+; AVX512DQ-NEXT:    kshiftrw $7, %k0, %k1
+; AVX512DQ-NEXT:    shlq $62, %r12
+; AVX512DQ-NEXT:    orq %rbp, %r12
+; AVX512DQ-NEXT:    shlq $63, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movq %rax, 40(%rsi)
+; AVX512DQ-NEXT:    movzbl %r14b, %r15d
+; AVX512DQ-NEXT:    movl %r15d, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    leaq (%rax,%rax,2), %r12
+; AVX512DQ-NEXT:    leaq (%r12,%rax,4), %r12
+; AVX512DQ-NEXT:    leaq (%r12,%rax,8), %rax
+; AVX512DQ-NEXT:    movzbl %r11b, %r13d
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $4, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r13, %r11
+; AVX512DQ-NEXT:    shlq $5, %r11
+; AVX512DQ-NEXT:    orq %r12, %r11
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $6, %rax
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $7, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $8, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    kmovw %k1, %ebp
+; AVX512DQ-NEXT:    kshiftrw $8, %k0, %k1
+; AVX512DQ-NEXT:    shlq $9, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movzbl %bpl, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $10, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $11, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $12, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $13, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $14, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    kmovw %k1, %ebp
+; AVX512DQ-NEXT:    kshiftrw $9, %k0, %k1
+; AVX512DQ-NEXT:    shlq $15, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movzbl %bpl, %r13d
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $16, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $17, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $18, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $19, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $20, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    kmovw %k1, %ebp
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k1
+; AVX512DQ-NEXT:    shlq $21, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movzbl %bpl, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $22, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $23, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $24, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $25, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $26, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    shlq $27, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movzbl %r10b, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %r13
+; AVX512DQ-NEXT:    shlq $28, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    movq %rax, %r12
+; AVX512DQ-NEXT:    shlq $29, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    movq %rax, %r13
+; AVX512DQ-NEXT:    shlq $30, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    kmovw %k1, %ebp
+; AVX512DQ-NEXT:    kshiftrw $11, %k0, %k1
+; AVX512DQ-NEXT:    shlq $31, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    andl $1, %r10d
+; AVX512DQ-NEXT:    movq %r10, %r13
+; AVX512DQ-NEXT:    shlq $32, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    kmovw %k1, %r12d
+; AVX512DQ-NEXT:    kshiftrw $12, %k0, %k1
+; AVX512DQ-NEXT:    shlq $33, %r10
+; AVX512DQ-NEXT:    orq %r13, %r10
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $34, %rax
+; AVX512DQ-NEXT:    orq %r10, %rax
+; AVX512DQ-NEXT:    movq %r12, %r10
+; AVX512DQ-NEXT:    shlq $35, %r10
+; AVX512DQ-NEXT:    orq %rax, %r10
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $36, %rax
+; AVX512DQ-NEXT:    orq %r10, %rax
+; AVX512DQ-NEXT:    movq %r12, %r10
+; AVX512DQ-NEXT:    shlq $37, %r10
+; AVX512DQ-NEXT:    orq %rax, %r10
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $38, %rax
+; AVX512DQ-NEXT:    orq %r10, %rax
+; AVX512DQ-NEXT:    kmovw %k1, %r10d
+; AVX512DQ-NEXT:    kshiftrw $13, %k0, %k1
+; AVX512DQ-NEXT:    shlq $39, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    andl $1, %r10d
+; AVX512DQ-NEXT:    movq %r10, %rax
+; AVX512DQ-NEXT:    shlq $40, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r10, %r12
+; AVX512DQ-NEXT:    shlq $41, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r10, %rax
+; AVX512DQ-NEXT:    shlq $42, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r10, %r12
+; AVX512DQ-NEXT:    shlq $43, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r10, %rax
+; AVX512DQ-NEXT:    shlq $44, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    kmovw %k1, %r12d
+; AVX512DQ-NEXT:    kshiftrw $14, %k0, %k1
+; AVX512DQ-NEXT:    shlq $45, %r10
+; AVX512DQ-NEXT:    orq %rax, %r10
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $46, %rax
+; AVX512DQ-NEXT:    orq %r10, %rax
+; AVX512DQ-NEXT:    movq %r12, %r10
+; AVX512DQ-NEXT:    shlq $47, %r10
+; AVX512DQ-NEXT:    orq %rax, %r10
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $48, %rax
+; AVX512DQ-NEXT:    orq %r10, %rax
+; AVX512DQ-NEXT:    movq %r12, %r10
+; AVX512DQ-NEXT:    shlq $49, %r10
+; AVX512DQ-NEXT:    orq %rax, %r10
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $50, %rax
+; AVX512DQ-NEXT:    orq %r10, %rax
+; AVX512DQ-NEXT:    kmovw %k1, %r13d
+; AVX512DQ-NEXT:    kshiftrw $1, %k0, %k1
+; AVX512DQ-NEXT:    shlq $51, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $52, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %r10
+; AVX512DQ-NEXT:    shlq $53, %r10
+; AVX512DQ-NEXT:    orq %rax, %r10
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $54, %rax
+; AVX512DQ-NEXT:    orq %r10, %rax
+; AVX512DQ-NEXT:    movq %r13, %r10
+; AVX512DQ-NEXT:    shlq $55, %r10
+; AVX512DQ-NEXT:    orq %rax, %r10
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $56, %rax
+; AVX512DQ-NEXT:    orq %r10, %rax
+; AVX512DQ-NEXT:    shlq $57, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movl %ebp, %r10d
+; AVX512DQ-NEXT:    andl $1, %r10d
+; AVX512DQ-NEXT:    movq %r10, %rax
+; AVX512DQ-NEXT:    shlq $58, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r10, %r12
+; AVX512DQ-NEXT:    shlq $59, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r10, %rax
+; AVX512DQ-NEXT:    shlq $60, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r10, %r12
+; AVX512DQ-NEXT:    shlq $61, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    kmovw %k1, %r13d
+; AVX512DQ-NEXT:    kshiftrw $2, %k0, %k1
+; AVX512DQ-NEXT:    shlq $62, %r10
+; AVX512DQ-NEXT:    orq %r12, %r10
+; AVX512DQ-NEXT:    kmovw %k1, %r12d
+; AVX512DQ-NEXT:    movzbl %bpl, %ebp
+; AVX512DQ-NEXT:    shlq $63, %rbp
+; AVX512DQ-NEXT:    orq %r10, %rbp
+; AVX512DQ-NEXT:    kmovw %k0, %r10d
+; AVX512DQ-NEXT:    kshiftrw $4, %k0, %k0
+; AVX512DQ-NEXT:    orq %r11, %rbp
+; AVX512DQ-NEXT:    kmovw %k0, %r11d
+; AVX512DQ-NEXT:    movq %rbp, 16(%rsi)
+; AVX512DQ-NEXT:    movzbl %r9b, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    andl $1, %ebx
+; AVX512DQ-NEXT:    leaq (%rbx,%rbx,2), %r9
+; AVX512DQ-NEXT:    leaq (%r9,%rax,4), %r9
+; AVX512DQ-NEXT:    leaq (%r9,%rax,8), %r9
+; AVX512DQ-NEXT:    movq %rax, %rbx
+; AVX512DQ-NEXT:    shlq $4, %rbx
+; AVX512DQ-NEXT:    orq %r9, %rbx
+; AVX512DQ-NEXT:    movq %rax, %r9
+; AVX512DQ-NEXT:    shlq $5, %r9
+; AVX512DQ-NEXT:    orq %rbx, %r9
+; AVX512DQ-NEXT:    movq %rax, %rbx
+; AVX512DQ-NEXT:    shlq $6, %rbx
+; AVX512DQ-NEXT:    shlq $7, %rax
+; AVX512DQ-NEXT:    orq %rbx, %rax
+; AVX512DQ-NEXT:    movzbl %r8b, %r8d
+; AVX512DQ-NEXT:    andl $1, %r8d
+; AVX512DQ-NEXT:    movq %r8, %rbx
+; AVX512DQ-NEXT:    shlq $8, %rbx
+; AVX512DQ-NEXT:    orq %rax, %rbx
+; AVX512DQ-NEXT:    movq %r8, %rax
+; AVX512DQ-NEXT:    shlq $9, %rax
+; AVX512DQ-NEXT:    orq %rbx, %rax
+; AVX512DQ-NEXT:    movq %r8, %rbx
+; AVX512DQ-NEXT:    shlq $10, %rbx
+; AVX512DQ-NEXT:    orq %rax, %rbx
+; AVX512DQ-NEXT:    movq %r8, %rax
+; AVX512DQ-NEXT:    shlq $11, %rax
+; AVX512DQ-NEXT:    orq %rbx, %rax
+; AVX512DQ-NEXT:    movq %r8, %rbx
+; AVX512DQ-NEXT:    shlq $12, %rbx
+; AVX512DQ-NEXT:    orq %rax, %rbx
+; AVX512DQ-NEXT:    shlq $13, %r8
+; AVX512DQ-NEXT:    orq %rbx, %r8
+; AVX512DQ-NEXT:    movzbl %dil, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %rdi
+; AVX512DQ-NEXT:    shlq $14, %rdi
+; AVX512DQ-NEXT:    orq %r8, %rdi
+; AVX512DQ-NEXT:    movq %rax, %r8
+; AVX512DQ-NEXT:    shlq $15, %r8
+; AVX512DQ-NEXT:    orq %rdi, %r8
+; AVX512DQ-NEXT:    movq %rax, %rdi
+; AVX512DQ-NEXT:    shlq $16, %rdi
+; AVX512DQ-NEXT:    orq %r8, %rdi
+; AVX512DQ-NEXT:    movq %rax, %r8
+; AVX512DQ-NEXT:    shlq $17, %r8
+; AVX512DQ-NEXT:    orq %rdi, %r8
+; AVX512DQ-NEXT:    movq %rax, %rdi
+; AVX512DQ-NEXT:    shlq $18, %rdi
+; AVX512DQ-NEXT:    orq %r8, %rdi
+; AVX512DQ-NEXT:    shlq $19, %rax
+; AVX512DQ-NEXT:    orq %rdi, %rax
+; AVX512DQ-NEXT:    movzbl %dl, %edx
+; AVX512DQ-NEXT:    andl $1, %edx
+; AVX512DQ-NEXT:    movq %rdx, %rdi
+; AVX512DQ-NEXT:    shlq $20, %rdi
+; AVX512DQ-NEXT:    orq %rax, %rdi
+; AVX512DQ-NEXT:    movq %rdx, %rax
+; AVX512DQ-NEXT:    shlq $21, %rax
+; AVX512DQ-NEXT:    orq %rdi, %rax
+; AVX512DQ-NEXT:    movq %rdx, %rdi
+; AVX512DQ-NEXT:    shlq $22, %rdi
+; AVX512DQ-NEXT:    orq %rax, %rdi
+; AVX512DQ-NEXT:    movq %rdx, %rax
+; AVX512DQ-NEXT:    shlq $23, %rax
+; AVX512DQ-NEXT:    orq %rdi, %rax
+; AVX512DQ-NEXT:    movq %rdx, %rdi
+; AVX512DQ-NEXT:    shlq $24, %rdi
+; AVX512DQ-NEXT:    orq %rax, %rdi
+; AVX512DQ-NEXT:    shlq $25, %rdx
+; AVX512DQ-NEXT:    orq %rdi, %rdx
+; AVX512DQ-NEXT:    movzbl %cl, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %rcx
+; AVX512DQ-NEXT:    shlq $26, %rcx
+; AVX512DQ-NEXT:    orq %rdx, %rcx
+; AVX512DQ-NEXT:    movq %rax, %rdx
+; AVX512DQ-NEXT:    shlq $27, %rdx
+; AVX512DQ-NEXT:    orq %rcx, %rdx
+; AVX512DQ-NEXT:    movq %rax, %rcx
+; AVX512DQ-NEXT:    shlq $28, %rcx
+; AVX512DQ-NEXT:    orq %rdx, %rcx
+; AVX512DQ-NEXT:    movq %rax, %rdx
+; AVX512DQ-NEXT:    shlq $29, %rdx
+; AVX512DQ-NEXT:    orq %rcx, %rdx
+; AVX512DQ-NEXT:    movq %rax, %rcx
+; AVX512DQ-NEXT:    shlq $30, %rcx
+; AVX512DQ-NEXT:    orq %rdx, %rcx
+; AVX512DQ-NEXT:    shlq $31, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    andl $1, %r10d
+; AVX512DQ-NEXT:    movq %r10, %rcx
+; AVX512DQ-NEXT:    shlq $32, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    movq %r10, %rax
+; AVX512DQ-NEXT:    shlq $33, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    movq %r10, %rcx
+; AVX512DQ-NEXT:    shlq $34, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    movq %r10, %rax
+; AVX512DQ-NEXT:    shlq $35, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    movq %r10, %rcx
+; AVX512DQ-NEXT:    shlq $36, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    shlq $37, %r10
+; AVX512DQ-NEXT:    orq %rcx, %r10
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $38, %rax
+; AVX512DQ-NEXT:    orq %r10, %rax
+; AVX512DQ-NEXT:    movq %r13, %rcx
+; AVX512DQ-NEXT:    shlq $39, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $40, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    movq %r13, %rcx
+; AVX512DQ-NEXT:    shlq $41, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $42, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    shlq $43, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $44, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %rcx
+; AVX512DQ-NEXT:    shlq $45, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $46, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    movq %r12, %rcx
+; AVX512DQ-NEXT:    shlq $47, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $48, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    shlq $49, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %rcx
+; AVX512DQ-NEXT:    shlq $50, %rcx
+; AVX512DQ-NEXT:    orq %r12, %rcx
+; AVX512DQ-NEXT:    movq %rax, %rdx
+; AVX512DQ-NEXT:    shlq $51, %rdx
+; AVX512DQ-NEXT:    orq %rcx, %rdx
+; AVX512DQ-NEXT:    movq %rax, %rcx
+; AVX512DQ-NEXT:    shlq $52, %rcx
+; AVX512DQ-NEXT:    orq %rdx, %rcx
+; AVX512DQ-NEXT:    movq %rax, %rdx
+; AVX512DQ-NEXT:    shlq $53, %rdx
+; AVX512DQ-NEXT:    orq %rcx, %rdx
+; AVX512DQ-NEXT:    movq %rax, %rcx
+; AVX512DQ-NEXT:    shlq $54, %rcx
+; AVX512DQ-NEXT:    orq %rdx, %rcx
+; AVX512DQ-NEXT:    shlq $55, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    andl $1, %r11d
+; AVX512DQ-NEXT:    movq %r11, %rcx
+; AVX512DQ-NEXT:    shlq $56, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    movq %r11, %rax
+; AVX512DQ-NEXT:    shlq $57, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    movq %r11, %rcx
+; AVX512DQ-NEXT:    shlq $58, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    movq %r11, %rax
+; AVX512DQ-NEXT:    shlq $59, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    movq %r11, %rcx
+; AVX512DQ-NEXT:    shlq $60, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    shlq $61, %r11
+; AVX512DQ-NEXT:    orq %rcx, %r11
+; AVX512DQ-NEXT:    andl $1, %r14d
+; AVX512DQ-NEXT:    shlq $62, %r14
+; AVX512DQ-NEXT:    orq %r11, %r14
+; AVX512DQ-NEXT:    shlq $63, %r15
+; AVX512DQ-NEXT:    orq %r14, %r15
+; AVX512DQ-NEXT:    orq %r9, %r15
+; AVX512DQ-NEXT:    movq %r15, 8(%rsi)
+; AVX512DQ-NEXT:    popq %rbx
+; AVX512DQ-NEXT:    popq %r12
+; AVX512DQ-NEXT:    popq %r13
+; AVX512DQ-NEXT:    popq %r14
+; AVX512DQ-NEXT:    popq %r15
+; AVX512DQ-NEXT:    popq %rbp
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: mask_replication_factor6_vf64:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    pushq %rbp
+; AVX512BW-NEXT:    pushq %r15
+; AVX512BW-NEXT:    pushq %r14
+; AVX512BW-NEXT:    pushq %r13
+; AVX512BW-NEXT:    pushq %r12
+; AVX512BW-NEXT:    pushq %rbx
+; AVX512BW-NEXT:    kmovq (%rdi), %k0
+; AVX512BW-NEXT:    kshiftrq $15, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512BW-NEXT:    kshiftrq $14, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %ecx
+; AVX512BW-NEXT:    kshiftrq $13, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %edx
+; AVX512BW-NEXT:    kshiftrq $12, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %edi
+; AVX512BW-NEXT:    kshiftrq $26, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r11d
+; AVX512BW-NEXT:    kshiftrq $22, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %ebx
+; AVX512BW-NEXT:    kshiftrq $58, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r14d
+; AVX512BW-NEXT:    kshiftrq $44, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %ebp
+; AVX512BW-NEXT:    kshiftrq $42, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r8d
+; AVX512BW-NEXT:    kshiftrq $5, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r9d
+; AVX512BW-NEXT:    kshiftrq $1, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r15d
+; AVX512BW-NEXT:    kshiftrq $2, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k0, %r10d
+; AVX512BW-NEXT:    movzbl %r10b, %r10d
+; AVX512BW-NEXT:    andl $1, %r10d
+; AVX512BW-NEXT:    leaq (%r10,%r10,2), %r12
+; AVX512BW-NEXT:    leaq (%r12,%r10,4), %r12
+; AVX512BW-NEXT:    leaq (%r12,%r10,8), %r12
+; AVX512BW-NEXT:    movq %r10, %r13
+; AVX512BW-NEXT:    shlq $4, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    shlq $5, %r10
+; AVX512BW-NEXT:    orq %r13, %r10
+; AVX512BW-NEXT:    movzbl %r15b, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %r15
+; AVX512BW-NEXT:    shlq $6, %r15
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $7, %r13
+; AVX512BW-NEXT:    orq %r15, %r13
+; AVX512BW-NEXT:    movq %r12, %r15
+; AVX512BW-NEXT:    shlq $8, %r15
+; AVX512BW-NEXT:    orq %r13, %r15
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $9, %r13
+; AVX512BW-NEXT:    orq %r15, %r13
+; AVX512BW-NEXT:    movq %r12, %r15
+; AVX512BW-NEXT:    shlq $10, %r15
+; AVX512BW-NEXT:    orq %r13, %r15
+; AVX512BW-NEXT:    kmovd %k1, %r13d
+; AVX512BW-NEXT:    kshiftrq $3, %k0, %k1
+; AVX512BW-NEXT:    shlq $11, %r12
+; AVX512BW-NEXT:    orq %r15, %r12
+; AVX512BW-NEXT:    movzbl %r13b, %r15d
+; AVX512BW-NEXT:    andl $1, %r15d
+; AVX512BW-NEXT:    movq %r15, %r13
+; AVX512BW-NEXT:    shlq $12, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    movq %r15, %r12
+; AVX512BW-NEXT:    shlq $13, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    movq %r15, %r13
+; AVX512BW-NEXT:    shlq $14, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    movq %r15, %r12
+; AVX512BW-NEXT:    shlq $15, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    movq %r15, %r13
+; AVX512BW-NEXT:    shlq $16, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrq $4, %k0, %k1
+; AVX512BW-NEXT:    shlq $17, %r15
+; AVX512BW-NEXT:    orq %r13, %r15
+; AVX512BW-NEXT:    movzbl %r12b, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $18, %r13
+; AVX512BW-NEXT:    orq %r15, %r13
+; AVX512BW-NEXT:    movq %r12, %r15
+; AVX512BW-NEXT:    shlq $19, %r15
+; AVX512BW-NEXT:    orq %r13, %r15
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $20, %r13
+; AVX512BW-NEXT:    orq %r15, %r13
+; AVX512BW-NEXT:    movq %r12, %r15
+; AVX512BW-NEXT:    shlq $21, %r15
+; AVX512BW-NEXT:    orq %r13, %r15
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $22, %r13
+; AVX512BW-NEXT:    orq %r15, %r13
+; AVX512BW-NEXT:    kmovd %k1, %r15d
+; AVX512BW-NEXT:    kshiftrq $10, %k0, %k1
+; AVX512BW-NEXT:    shlq $23, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    movzbl %r15b, %r15d
+; AVX512BW-NEXT:    andl $1, %r15d
+; AVX512BW-NEXT:    movq %r15, %r13
+; AVX512BW-NEXT:    shlq $24, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    movq %r15, %r12
+; AVX512BW-NEXT:    shlq $25, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    movq %r15, %r13
+; AVX512BW-NEXT:    shlq $26, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    movq %r15, %r12
+; AVX512BW-NEXT:    shlq $27, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    movq %r15, %r13
+; AVX512BW-NEXT:    shlq $28, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    shlq $29, %r15
+; AVX512BW-NEXT:    orq %r13, %r15
+; AVX512BW-NEXT:    movzbl %r9b, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $30, %r13
+; AVX512BW-NEXT:    orq %r15, %r13
+; AVX512BW-NEXT:    kmovd %k1, %r15d
+; AVX512BW-NEXT:    kshiftrq $6, %k0, %k1
+; AVX512BW-NEXT:    shlq $31, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    andl $1, %r9d
+; AVX512BW-NEXT:    movq %r9, %r13
+; AVX512BW-NEXT:    shlq $32, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    movq %r9, %r12
+; AVX512BW-NEXT:    shlq $33, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    movq %r9, %r13
+; AVX512BW-NEXT:    shlq $34, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrq $7, %k0, %k1
+; AVX512BW-NEXT:    shlq $35, %r9
+; AVX512BW-NEXT:    orq %r13, %r9
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $36, %r13
+; AVX512BW-NEXT:    orq %r9, %r13
+; AVX512BW-NEXT:    movq %r12, %r9
+; AVX512BW-NEXT:    shlq $37, %r9
+; AVX512BW-NEXT:    orq %r13, %r9
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $38, %r13
+; AVX512BW-NEXT:    orq %r9, %r13
+; AVX512BW-NEXT:    movq %r12, %r9
+; AVX512BW-NEXT:    shlq $39, %r9
+; AVX512BW-NEXT:    orq %r13, %r9
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $40, %r13
+; AVX512BW-NEXT:    orq %r9, %r13
+; AVX512BW-NEXT:    kmovd %k1, %r9d
+; AVX512BW-NEXT:    kshiftrq $8, %k0, %k1
+; AVX512BW-NEXT:    shlq $41, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    andl $1, %r9d
+; AVX512BW-NEXT:    movq %r9, %r13
+; AVX512BW-NEXT:    shlq $42, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    movq %r9, %r12
+; AVX512BW-NEXT:    shlq $43, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    movq %r9, %r13
+; AVX512BW-NEXT:    shlq $44, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    movq %r9, %r12
+; AVX512BW-NEXT:    shlq $45, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    movq %r9, %r13
+; AVX512BW-NEXT:    shlq $46, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrq $9, %k0, %k1
+; AVX512BW-NEXT:    shlq $47, %r9
+; AVX512BW-NEXT:    orq %r13, %r9
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $48, %r13
+; AVX512BW-NEXT:    orq %r9, %r13
+; AVX512BW-NEXT:    movq %r12, %r9
+; AVX512BW-NEXT:    shlq $49, %r9
+; AVX512BW-NEXT:    orq %r13, %r9
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $50, %r13
+; AVX512BW-NEXT:    orq %r9, %r13
+; AVX512BW-NEXT:    movq %r12, %r9
+; AVX512BW-NEXT:    shlq $51, %r9
+; AVX512BW-NEXT:    orq %r13, %r9
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $52, %r13
+; AVX512BW-NEXT:    orq %r9, %r13
+; AVX512BW-NEXT:    kmovd %k1, %r9d
+; AVX512BW-NEXT:    kshiftrq $43, %k0, %k1
+; AVX512BW-NEXT:    shlq $53, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    andl $1, %r9d
+; AVX512BW-NEXT:    movq %r9, %r13
+; AVX512BW-NEXT:    shlq $54, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    movq %r9, %r12
+; AVX512BW-NEXT:    shlq $55, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    movq %r9, %r13
+; AVX512BW-NEXT:    shlq $56, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    movq %r9, %r12
+; AVX512BW-NEXT:    shlq $57, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    movq %r9, %r13
+; AVX512BW-NEXT:    shlq $58, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    shlq $59, %r9
+; AVX512BW-NEXT:    orq %r13, %r9
+; AVX512BW-NEXT:    movl %r15d, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $60, %r13
+; AVX512BW-NEXT:    orq %r9, %r13
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $61, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    kmovd %k1, %r13d
+; AVX512BW-NEXT:    kshiftrq $45, %k0, %k1
+; AVX512BW-NEXT:    movzbl %r15b, %r9d
+; AVX512BW-NEXT:    shlq $62, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r9, %rax
+; AVX512BW-NEXT:    shlq $63, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    orq %r10, %rax
+; AVX512BW-NEXT:    movq %rax, (%rsi)
+; AVX512BW-NEXT:    movzbl %r13b, %r13d
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movzbl %r8b, %r10d
+; AVX512BW-NEXT:    movl %r10d, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    leaq (%rax,%rax,2), %rax
+; AVX512BW-NEXT:    leaq (%rax,%r13,4), %rax
+; AVX512BW-NEXT:    leaq (%rax,%r13,8), %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $4, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r13, %r15
+; AVX512BW-NEXT:    shlq $5, %r15
+; AVX512BW-NEXT:    orq %r12, %r15
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $6, %rax
+; AVX512BW-NEXT:    shlq $7, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movzbl %bpl, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $8, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $9, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $10, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $11, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $12, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    kmovd %k1, %ebp
+; AVX512BW-NEXT:    kshiftrq $46, %k0, %k1
+; AVX512BW-NEXT:    shlq $13, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movzbl %bpl, %r13d
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $14, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $15, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $16, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $17, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $18, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    kmovd %k1, %ebp
+; AVX512BW-NEXT:    kshiftrq $47, %k0, %k1
+; AVX512BW-NEXT:    shlq $19, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movzbl %bpl, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $20, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $21, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $22, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $23, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $24, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    kmovd %k1, %ebp
+; AVX512BW-NEXT:    kshiftrq $48, %k0, %k1
+; AVX512BW-NEXT:    shlq $25, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movzbl %bpl, %r13d
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $26, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $27, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $28, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $29, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $30, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrq $49, %k0, %k1
+; AVX512BW-NEXT:    shlq $31, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $32, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $33, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $34, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $35, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $36, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    kmovd %k1, %r13d
+; AVX512BW-NEXT:    kshiftrq $50, %k0, %k1
+; AVX512BW-NEXT:    shlq $37, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $38, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $39, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $40, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $41, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $42, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrq $51, %k0, %k1
+; AVX512BW-NEXT:    shlq $43, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $44, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $45, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $46, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $47, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $48, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    kmovd %k1, %r13d
+; AVX512BW-NEXT:    kshiftrq $52, %k0, %k1
+; AVX512BW-NEXT:    shlq $49, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $50, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $51, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $52, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $53, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $54, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrq $53, %k0, %k1
+; AVX512BW-NEXT:    shlq $55, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $56, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $57, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $58, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $59, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $60, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    kmovd %k1, %ebp
+; AVX512BW-NEXT:    kshiftrq $54, %k0, %k1
+; AVX512BW-NEXT:    shlq $61, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movzbl %bpl, %eax
+; AVX512BW-NEXT:    movl %ebp, %r13d
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    shlq $62, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    movq %rax, %r12
+; AVX512BW-NEXT:    shlq $63, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    kmovd %k1, %ebp
+; AVX512BW-NEXT:    kshiftrq $55, %k0, %k1
+; AVX512BW-NEXT:    orq %r15, %r12
+; AVX512BW-NEXT:    movq %r12, 32(%rsi)
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    leaq (%rax,%rax,2), %r15
+; AVX512BW-NEXT:    leaq (%r15,%rax,4), %r15
+; AVX512BW-NEXT:    leaq (%r15,%rax,8), %rax
+; AVX512BW-NEXT:    movzbl %bpl, %r13d
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $4, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r13, %r15
+; AVX512BW-NEXT:    shlq $5, %r15
+; AVX512BW-NEXT:    orq %r12, %r15
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $6, %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $7, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $8, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    kmovd %k1, %ebp
+; AVX512BW-NEXT:    kshiftrq $56, %k0, %k1
+; AVX512BW-NEXT:    shlq $9, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movzbl %bpl, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $10, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $11, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $12, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $13, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $14, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    kmovd %k1, %ebp
+; AVX512BW-NEXT:    kshiftrq $57, %k0, %k1
+; AVX512BW-NEXT:    shlq $15, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movzbl %bpl, %r13d
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $16, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $17, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $18, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $19, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $20, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    kmovd %k1, %ebp
+; AVX512BW-NEXT:    kshiftrq $63, %k0, %k1
+; AVX512BW-NEXT:    shlq $21, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movzbl %bpl, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $22, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $23, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $24, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $25, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $26, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    shlq $27, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movzbl %r14b, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %r13
+; AVX512BW-NEXT:    shlq $28, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    movq %rax, %r12
+; AVX512BW-NEXT:    shlq $29, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    movq %rax, %r13
+; AVX512BW-NEXT:    shlq $30, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    kmovd %k1, %ebp
+; AVX512BW-NEXT:    kshiftrq $59, %k0, %k1
+; AVX512BW-NEXT:    shlq $31, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    andl $1, %r14d
+; AVX512BW-NEXT:    movq %r14, %r13
+; AVX512BW-NEXT:    shlq $32, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrq $60, %k0, %k1
+; AVX512BW-NEXT:    shlq $33, %r14
+; AVX512BW-NEXT:    orq %r13, %r14
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $34, %rax
+; AVX512BW-NEXT:    orq %r14, %rax
+; AVX512BW-NEXT:    movq %r12, %r14
+; AVX512BW-NEXT:    shlq $35, %r14
+; AVX512BW-NEXT:    orq %rax, %r14
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $36, %rax
+; AVX512BW-NEXT:    orq %r14, %rax
+; AVX512BW-NEXT:    movq %r12, %r14
+; AVX512BW-NEXT:    shlq $37, %r14
+; AVX512BW-NEXT:    orq %rax, %r14
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $38, %rax
+; AVX512BW-NEXT:    orq %r14, %rax
+; AVX512BW-NEXT:    kmovd %k1, %r14d
+; AVX512BW-NEXT:    kshiftrq $61, %k0, %k1
+; AVX512BW-NEXT:    shlq $39, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    andl $1, %r14d
+; AVX512BW-NEXT:    movq %r14, %rax
+; AVX512BW-NEXT:    shlq $40, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r14, %r12
+; AVX512BW-NEXT:    shlq $41, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r14, %rax
+; AVX512BW-NEXT:    shlq $42, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r14, %r12
+; AVX512BW-NEXT:    shlq $43, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r14, %rax
+; AVX512BW-NEXT:    shlq $44, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrq $62, %k0, %k1
+; AVX512BW-NEXT:    shlq $45, %r14
+; AVX512BW-NEXT:    orq %rax, %r14
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $46, %rax
+; AVX512BW-NEXT:    orq %r14, %rax
+; AVX512BW-NEXT:    movq %r12, %r14
+; AVX512BW-NEXT:    shlq $47, %r14
+; AVX512BW-NEXT:    orq %rax, %r14
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $48, %rax
+; AVX512BW-NEXT:    orq %r14, %rax
+; AVX512BW-NEXT:    movq %r12, %r14
+; AVX512BW-NEXT:    shlq $49, %r14
+; AVX512BW-NEXT:    orq %rax, %r14
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $50, %rax
+; AVX512BW-NEXT:    orq %r14, %rax
+; AVX512BW-NEXT:    kmovd %k1, %r14d
+; AVX512BW-NEXT:    kshiftrq $21, %k0, %k1
+; AVX512BW-NEXT:    shlq $51, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    andl $1, %r14d
+; AVX512BW-NEXT:    movq %r14, %rax
+; AVX512BW-NEXT:    shlq $52, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r14, %r12
+; AVX512BW-NEXT:    shlq $53, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r14, %rax
+; AVX512BW-NEXT:    shlq $54, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r14, %r12
+; AVX512BW-NEXT:    shlq $55, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r14, %rax
+; AVX512BW-NEXT:    shlq $56, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    shlq $57, %r14
+; AVX512BW-NEXT:    orq %rax, %r14
+; AVX512BW-NEXT:    movl %ebp, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %r12
+; AVX512BW-NEXT:    shlq $58, %r12
+; AVX512BW-NEXT:    orq %r14, %r12
+; AVX512BW-NEXT:    movq %rax, %r14
+; AVX512BW-NEXT:    shlq $59, %r14
+; AVX512BW-NEXT:    orq %r12, %r14
+; AVX512BW-NEXT:    movq %rax, %r12
+; AVX512BW-NEXT:    shlq $60, %r12
+; AVX512BW-NEXT:    orq %r14, %r12
+; AVX512BW-NEXT:    movq %rax, %r13
+; AVX512BW-NEXT:    shlq $61, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    kmovd %k1, %r14d
+; AVX512BW-NEXT:    kshiftrq $23, %k0, %k1
+; AVX512BW-NEXT:    movzbl %bpl, %r12d
+; AVX512BW-NEXT:    shlq $62, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    shlq $63, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    orq %r15, %r12
+; AVX512BW-NEXT:    movq %r12, 40(%rsi)
+; AVX512BW-NEXT:    movzbl %r14b, %r15d
+; AVX512BW-NEXT:    movl %r15d, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    leaq (%rax,%rax,2), %r12
+; AVX512BW-NEXT:    leaq (%r12,%rax,4), %r12
+; AVX512BW-NEXT:    leaq (%r12,%rax,8), %rax
+; AVX512BW-NEXT:    movzbl %bl, %r13d
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $4, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r13, %rbx
+; AVX512BW-NEXT:    shlq $5, %rbx
+; AVX512BW-NEXT:    orq %r12, %rbx
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $6, %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $7, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $8, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    kmovd %k1, %ebp
+; AVX512BW-NEXT:    kshiftrq $24, %k0, %k1
+; AVX512BW-NEXT:    shlq $9, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movzbl %bpl, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $10, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $11, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $12, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $13, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $14, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    kmovd %k1, %ebp
+; AVX512BW-NEXT:    kshiftrq $25, %k0, %k1
+; AVX512BW-NEXT:    shlq $15, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movzbl %bpl, %r13d
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $16, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $17, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $18, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $19, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $20, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    kmovd %k1, %ebp
+; AVX512BW-NEXT:    kshiftrq $31, %k0, %k1
+; AVX512BW-NEXT:    shlq $21, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movzbl %bpl, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $22, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $23, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $24, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $25, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $26, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    shlq $27, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movzbl %r11b, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %r13
+; AVX512BW-NEXT:    shlq $28, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    movq %rax, %r12
+; AVX512BW-NEXT:    shlq $29, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    movq %rax, %r13
+; AVX512BW-NEXT:    shlq $30, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    kmovd %k1, %ebp
+; AVX512BW-NEXT:    kshiftrq $27, %k0, %k1
+; AVX512BW-NEXT:    shlq $31, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    andl $1, %r11d
+; AVX512BW-NEXT:    movq %r11, %r13
+; AVX512BW-NEXT:    shlq $32, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrq $28, %k0, %k1
+; AVX512BW-NEXT:    shlq $33, %r11
+; AVX512BW-NEXT:    orq %r13, %r11
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $34, %rax
+; AVX512BW-NEXT:    orq %r11, %rax
+; AVX512BW-NEXT:    movq %r12, %r11
+; AVX512BW-NEXT:    shlq $35, %r11
+; AVX512BW-NEXT:    orq %rax, %r11
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $36, %rax
+; AVX512BW-NEXT:    orq %r11, %rax
+; AVX512BW-NEXT:    movq %r12, %r11
+; AVX512BW-NEXT:    shlq $37, %r11
+; AVX512BW-NEXT:    orq %rax, %r11
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $38, %rax
+; AVX512BW-NEXT:    orq %r11, %rax
+; AVX512BW-NEXT:    kmovd %k1, %r11d
+; AVX512BW-NEXT:    kshiftrq $29, %k0, %k1
+; AVX512BW-NEXT:    shlq $39, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    andl $1, %r11d
+; AVX512BW-NEXT:    movq %r11, %rax
+; AVX512BW-NEXT:    shlq $40, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r11, %r12
+; AVX512BW-NEXT:    shlq $41, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r11, %rax
+; AVX512BW-NEXT:    shlq $42, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r11, %r12
+; AVX512BW-NEXT:    shlq $43, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r11, %rax
+; AVX512BW-NEXT:    shlq $44, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrq $30, %k0, %k1
+; AVX512BW-NEXT:    shlq $45, %r11
+; AVX512BW-NEXT:    orq %rax, %r11
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $46, %rax
+; AVX512BW-NEXT:    orq %r11, %rax
+; AVX512BW-NEXT:    movq %r12, %r11
+; AVX512BW-NEXT:    shlq $47, %r11
+; AVX512BW-NEXT:    orq %rax, %r11
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $48, %rax
+; AVX512BW-NEXT:    orq %r11, %rax
+; AVX512BW-NEXT:    movq %r12, %r11
+; AVX512BW-NEXT:    shlq $49, %r11
+; AVX512BW-NEXT:    orq %rax, %r11
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $50, %rax
+; AVX512BW-NEXT:    orq %r11, %rax
+; AVX512BW-NEXT:    kmovd %k1, %r11d
+; AVX512BW-NEXT:    kshiftrq $11, %k0, %k1
+; AVX512BW-NEXT:    shlq $51, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    andl $1, %r11d
+; AVX512BW-NEXT:    movq %r11, %rax
+; AVX512BW-NEXT:    shlq $52, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r11, %r12
+; AVX512BW-NEXT:    shlq $53, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r11, %rax
+; AVX512BW-NEXT:    shlq $54, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r11, %r12
+; AVX512BW-NEXT:    shlq $55, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r11, %rax
+; AVX512BW-NEXT:    shlq $56, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    shlq $57, %r11
+; AVX512BW-NEXT:    orq %rax, %r11
+; AVX512BW-NEXT:    movl %ebp, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %r12
+; AVX512BW-NEXT:    shlq $58, %r12
+; AVX512BW-NEXT:    orq %r11, %r12
+; AVX512BW-NEXT:    movq %rax, %r11
+; AVX512BW-NEXT:    shlq $59, %r11
+; AVX512BW-NEXT:    orq %r12, %r11
+; AVX512BW-NEXT:    movq %rax, %r12
+; AVX512BW-NEXT:    shlq $60, %r12
+; AVX512BW-NEXT:    orq %r11, %r12
+; AVX512BW-NEXT:    movq %rax, %r13
+; AVX512BW-NEXT:    shlq $61, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    kmovd %k1, %r11d
+; AVX512BW-NEXT:    kshiftrq $33, %k0, %k1
+; AVX512BW-NEXT:    shlq $62, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrq $32, %k0, %k1
+; AVX512BW-NEXT:    movzbl %bpl, %r13d
+; AVX512BW-NEXT:    shlq $63, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    kshiftrq $34, %k0, %k1
+; AVX512BW-NEXT:    orq %rbx, %r13
+; AVX512BW-NEXT:    movq %r13, 16(%rsi)
+; AVX512BW-NEXT:    movzbl %al, %ebx
+; AVX512BW-NEXT:    andl $1, %ebx
+; AVX512BW-NEXT:    leaq (%rbx,%rbx,2), %rax
+; AVX512BW-NEXT:    leaq (%rax,%rbx,4), %rax
+; AVX512BW-NEXT:    leaq (%rax,%rbx,8), %rax
+; AVX512BW-NEXT:    movq %rbx, %r13
+; AVX512BW-NEXT:    shlq $4, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    shlq $5, %rbx
+; AVX512BW-NEXT:    orq %r13, %rbx
+; AVX512BW-NEXT:    movzbl %r12b, %r13d
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $6, %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $7, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $8, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $9, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $10, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    kmovd %k1, %ebp
+; AVX512BW-NEXT:    kshiftrq $35, %k0, %k1
+; AVX512BW-NEXT:    shlq $11, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movzbl %bpl, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $12, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $13, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $14, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $15, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $16, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    kmovd %k1, %ebp
+; AVX512BW-NEXT:    kshiftrq $36, %k0, %k1
+; AVX512BW-NEXT:    shlq $17, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movzbl %bpl, %r13d
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $18, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $19, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $20, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $21, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $22, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    kmovd %k1, %ebp
+; AVX512BW-NEXT:    kshiftrq $37, %k0, %k1
+; AVX512BW-NEXT:    shlq $23, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movzbl %bpl, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $24, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $25, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $26, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $27, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $28, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    kmovd %k1, %ebp
+; AVX512BW-NEXT:    kshiftrq $38, %k0, %k1
+; AVX512BW-NEXT:    shlq $29, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movzbl %bpl, %r13d
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $30, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $31, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $32, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $33, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $34, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    kmovd %k1, %ebp
+; AVX512BW-NEXT:    kshiftrq $39, %k0, %k1
+; AVX512BW-NEXT:    shlq $35, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movzbl %bpl, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $36, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $37, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $38, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $39, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $40, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    kmovd %k1, %r13d
+; AVX512BW-NEXT:    kshiftrq $40, %k0, %k1
+; AVX512BW-NEXT:    shlq $41, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $42, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $43, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $44, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $45, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $46, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    kmovd %k1, %ebp
+; AVX512BW-NEXT:    kshiftrq $41, %k0, %k1
+; AVX512BW-NEXT:    shlq $47, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    andl $1, %ebp
+; AVX512BW-NEXT:    movq %rbp, %rax
+; AVX512BW-NEXT:    shlq $48, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %rbp, %r12
+; AVX512BW-NEXT:    shlq $49, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %rbp, %rax
+; AVX512BW-NEXT:    shlq $50, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %rbp, %r12
+; AVX512BW-NEXT:    shlq $51, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %rbp, %rax
+; AVX512BW-NEXT:    shlq $52, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrq $16, %k0, %k1
+; AVX512BW-NEXT:    shlq $53, %rbp
+; AVX512BW-NEXT:    orq %rax, %rbp
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $54, %rax
+; AVX512BW-NEXT:    orq %rbp, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $55, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $56, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %rbp
+; AVX512BW-NEXT:    shlq $57, %rbp
+; AVX512BW-NEXT:    orq %rax, %rbp
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $58, %r13
+; AVX512BW-NEXT:    orq %rbp, %r13
+; AVX512BW-NEXT:    kmovd %k1, %ebp
+; AVX512BW-NEXT:    kshiftrq $17, %k0, %k1
+; AVX512BW-NEXT:    shlq $59, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    andl $1, %r8d
+; AVX512BW-NEXT:    movq %r8, %rax
+; AVX512BW-NEXT:    shlq $60, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r8, %r13
+; AVX512BW-NEXT:    shlq $61, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrq $18, %k0, %k1
+; AVX512BW-NEXT:    shlq $62, %r8
+; AVX512BW-NEXT:    orq %r13, %r8
+; AVX512BW-NEXT:    kmovd %k1, %r13d
+; AVX512BW-NEXT:    kshiftrq $19, %k0, %k1
+; AVX512BW-NEXT:    shlq $63, %r10
+; AVX512BW-NEXT:    orq %r8, %r10
+; AVX512BW-NEXT:    kmovd %k1, %r8d
+; AVX512BW-NEXT:    kshiftrq $20, %k0, %k0
+; AVX512BW-NEXT:    orq %rbx, %r10
+; AVX512BW-NEXT:    kmovd %k0, %ebx
+; AVX512BW-NEXT:    movq %r10, 24(%rsi)
+; AVX512BW-NEXT:    movzbl %r11b, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    andl $1, %r9d
+; AVX512BW-NEXT:    leaq (%r9,%r9,2), %r9
+; AVX512BW-NEXT:    leaq (%r9,%rax,4), %r9
+; AVX512BW-NEXT:    leaq (%r9,%rax,8), %r9
+; AVX512BW-NEXT:    movq %rax, %r10
+; AVX512BW-NEXT:    shlq $4, %r10
+; AVX512BW-NEXT:    orq %r9, %r10
+; AVX512BW-NEXT:    movq %rax, %r9
+; AVX512BW-NEXT:    shlq $5, %r9
+; AVX512BW-NEXT:    orq %r10, %r9
+; AVX512BW-NEXT:    movq %rax, %r10
+; AVX512BW-NEXT:    shlq $6, %r10
+; AVX512BW-NEXT:    shlq $7, %rax
+; AVX512BW-NEXT:    orq %r10, %rax
+; AVX512BW-NEXT:    movzbl %dil, %edi
+; AVX512BW-NEXT:    andl $1, %edi
+; AVX512BW-NEXT:    movq %rdi, %r10
+; AVX512BW-NEXT:    shlq $8, %r10
+; AVX512BW-NEXT:    orq %rax, %r10
+; AVX512BW-NEXT:    movq %rdi, %rax
+; AVX512BW-NEXT:    shlq $9, %rax
+; AVX512BW-NEXT:    orq %r10, %rax
+; AVX512BW-NEXT:    movq %rdi, %r10
+; AVX512BW-NEXT:    shlq $10, %r10
+; AVX512BW-NEXT:    orq %rax, %r10
+; AVX512BW-NEXT:    movq %rdi, %rax
+; AVX512BW-NEXT:    shlq $11, %rax
+; AVX512BW-NEXT:    orq %r10, %rax
+; AVX512BW-NEXT:    movq %rdi, %r10
+; AVX512BW-NEXT:    shlq $12, %r10
+; AVX512BW-NEXT:    orq %rax, %r10
+; AVX512BW-NEXT:    shlq $13, %rdi
+; AVX512BW-NEXT:    orq %r10, %rdi
+; AVX512BW-NEXT:    movzbl %dl, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %rdx
+; AVX512BW-NEXT:    shlq $14, %rdx
+; AVX512BW-NEXT:    orq %rdi, %rdx
+; AVX512BW-NEXT:    movq %rax, %rdi
+; AVX512BW-NEXT:    shlq $15, %rdi
+; AVX512BW-NEXT:    orq %rdx, %rdi
+; AVX512BW-NEXT:    movq %rax, %rdx
+; AVX512BW-NEXT:    shlq $16, %rdx
+; AVX512BW-NEXT:    orq %rdi, %rdx
+; AVX512BW-NEXT:    movq %rax, %rdi
+; AVX512BW-NEXT:    shlq $17, %rdi
+; AVX512BW-NEXT:    orq %rdx, %rdi
+; AVX512BW-NEXT:    movq %rax, %rdx
+; AVX512BW-NEXT:    shlq $18, %rdx
+; AVX512BW-NEXT:    orq %rdi, %rdx
+; AVX512BW-NEXT:    shlq $19, %rax
+; AVX512BW-NEXT:    orq %rdx, %rax
+; AVX512BW-NEXT:    movzbl %cl, %ecx
+; AVX512BW-NEXT:    andl $1, %ecx
+; AVX512BW-NEXT:    movq %rcx, %rdx
+; AVX512BW-NEXT:    shlq $20, %rdx
+; AVX512BW-NEXT:    orq %rax, %rdx
+; AVX512BW-NEXT:    movq %rcx, %rax
+; AVX512BW-NEXT:    shlq $21, %rax
+; AVX512BW-NEXT:    orq %rdx, %rax
+; AVX512BW-NEXT:    movq %rcx, %rdx
+; AVX512BW-NEXT:    shlq $22, %rdx
+; AVX512BW-NEXT:    orq %rax, %rdx
+; AVX512BW-NEXT:    movq %rcx, %rax
+; AVX512BW-NEXT:    shlq $23, %rax
+; AVX512BW-NEXT:    orq %rdx, %rax
+; AVX512BW-NEXT:    movq %rcx, %rdx
+; AVX512BW-NEXT:    shlq $24, %rdx
+; AVX512BW-NEXT:    orq %rax, %rdx
+; AVX512BW-NEXT:    shlq $25, %rcx
+; AVX512BW-NEXT:    orq %rdx, %rcx
+; AVX512BW-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %rdx
+; AVX512BW-NEXT:    shlq $26, %rdx
+; AVX512BW-NEXT:    orq %rcx, %rdx
+; AVX512BW-NEXT:    movq %rax, %rcx
+; AVX512BW-NEXT:    shlq $27, %rcx
+; AVX512BW-NEXT:    orq %rdx, %rcx
+; AVX512BW-NEXT:    movq %rax, %rdx
+; AVX512BW-NEXT:    shlq $28, %rdx
+; AVX512BW-NEXT:    orq %rcx, %rdx
+; AVX512BW-NEXT:    movq %rax, %rcx
+; AVX512BW-NEXT:    shlq $29, %rcx
+; AVX512BW-NEXT:    orq %rdx, %rcx
+; AVX512BW-NEXT:    movq %rax, %rdx
+; AVX512BW-NEXT:    shlq $30, %rdx
+; AVX512BW-NEXT:    orq %rcx, %rdx
+; AVX512BW-NEXT:    shlq $31, %rax
+; AVX512BW-NEXT:    orq %rdx, %rax
+; AVX512BW-NEXT:    andl $1, %ebp
+; AVX512BW-NEXT:    movq %rbp, %rcx
+; AVX512BW-NEXT:    shlq $32, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    movq %rbp, %rax
+; AVX512BW-NEXT:    shlq $33, %rax
+; AVX512BW-NEXT:    orq %rcx, %rax
+; AVX512BW-NEXT:    movq %rbp, %rcx
+; AVX512BW-NEXT:    shlq $34, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    movq %rbp, %rax
+; AVX512BW-NEXT:    shlq $35, %rax
+; AVX512BW-NEXT:    orq %rcx, %rax
+; AVX512BW-NEXT:    movq %rbp, %rcx
+; AVX512BW-NEXT:    shlq $36, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    shlq $37, %rbp
+; AVX512BW-NEXT:    orq %rcx, %rbp
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $38, %rax
+; AVX512BW-NEXT:    orq %rbp, %rax
+; AVX512BW-NEXT:    movq %r12, %rcx
+; AVX512BW-NEXT:    shlq $39, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $40, %rax
+; AVX512BW-NEXT:    orq %rcx, %rax
+; AVX512BW-NEXT:    movq %r12, %rcx
+; AVX512BW-NEXT:    shlq $41, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $42, %rax
+; AVX512BW-NEXT:    orq %rcx, %rax
+; AVX512BW-NEXT:    shlq $43, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $44, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %rcx
+; AVX512BW-NEXT:    shlq $45, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $46, %rax
+; AVX512BW-NEXT:    orq %rcx, %rax
+; AVX512BW-NEXT:    movq %r13, %rcx
+; AVX512BW-NEXT:    shlq $47, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $48, %rax
+; AVX512BW-NEXT:    orq %rcx, %rax
+; AVX512BW-NEXT:    shlq $49, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    andl $1, %r8d
+; AVX512BW-NEXT:    movq %r8, %rax
+; AVX512BW-NEXT:    shlq $50, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r8, %rcx
+; AVX512BW-NEXT:    shlq $51, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    movq %r8, %rax
+; AVX512BW-NEXT:    shlq $52, %rax
+; AVX512BW-NEXT:    orq %rcx, %rax
+; AVX512BW-NEXT:    movq %r8, %rcx
+; AVX512BW-NEXT:    shlq $53, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    movq %r8, %rax
+; AVX512BW-NEXT:    shlq $54, %rax
+; AVX512BW-NEXT:    orq %rcx, %rax
+; AVX512BW-NEXT:    shlq $55, %r8
+; AVX512BW-NEXT:    orq %rax, %r8
+; AVX512BW-NEXT:    andl $1, %ebx
+; AVX512BW-NEXT:    movq %rbx, %rax
+; AVX512BW-NEXT:    shlq $56, %rax
+; AVX512BW-NEXT:    orq %r8, %rax
+; AVX512BW-NEXT:    movq %rbx, %rcx
+; AVX512BW-NEXT:    shlq $57, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    movq %rbx, %rax
+; AVX512BW-NEXT:    shlq $58, %rax
+; AVX512BW-NEXT:    orq %rcx, %rax
+; AVX512BW-NEXT:    movq %rbx, %rcx
+; AVX512BW-NEXT:    shlq $59, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    movq %rbx, %rax
+; AVX512BW-NEXT:    shlq $60, %rax
+; AVX512BW-NEXT:    orq %rcx, %rax
+; AVX512BW-NEXT:    shlq $61, %rbx
+; AVX512BW-NEXT:    orq %rax, %rbx
+; AVX512BW-NEXT:    andl $1, %r14d
+; AVX512BW-NEXT:    shlq $62, %r14
+; AVX512BW-NEXT:    orq %rbx, %r14
+; AVX512BW-NEXT:    shlq $63, %r15
+; AVX512BW-NEXT:    orq %r14, %r15
+; AVX512BW-NEXT:    orq %r9, %r15
+; AVX512BW-NEXT:    movq %r15, 8(%rsi)
+; AVX512BW-NEXT:    popq %rbx
+; AVX512BW-NEXT:    popq %r12
+; AVX512BW-NEXT:    popq %r13
+; AVX512BW-NEXT:    popq %r14
+; AVX512BW-NEXT:    popq %r15
+; AVX512BW-NEXT:    popq %rbp
+; AVX512BW-NEXT:    retq
+  %src.vec = load <64 x i1>, ptr %in.vec, align 64
+  %tgt.mask = shufflevector <64 x i1> %src.vec, <64 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 
41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+  store <384 x i1> %tgt.mask, ptr %out.vec, align 64
+  ret void
+}
+
+; Replication factor 7, 2 source elements: loads a <2 x i1> mask from %in.vec,
+; repeats each element 7 times (shuffle indices 0 x7 followed by 1 x7) and
+; stores the resulting <14 x i1> mask to %out.vec.
+;
+; The assertions below are autogenerated (see the NOTE at the top of this file)
+; and must not be edited by hand. In every checked configuration the expected
+; code is scalar: each source bit is moved from a mask register into a GPR,
+; shifted copies are OR-ed into bits 0-6 (element 0) and bits 7-13 (element 1),
+; the result is masked to 14 bits (0x3FFF) and written with a 16-bit movw.
+; The configurations differ only in the mask instructions used (kmovw/kshiftrw,
+; kmovb/kshiftrb with DQ, kmovd for the mask-to-GPR move with BW).
+define void @mask_replication_factor7_vf2(ptr %in.vec, ptr %out.vec) nounwind {
+; AVX512F-ONLY-LABEL: mask_replication_factor7_vf2:
+; AVX512F-ONLY:       # %bb.0:
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
+; AVX512F-ONLY-NEXT:    kshiftrw $1, %k1, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k1, %edx
+; AVX512F-ONLY-NEXT:    andl $1, %edx
+; AVX512F-ONLY-NEXT:    leal (%rdx,%rdx,2), %eax
+; AVX512F-ONLY-NEXT:    leal (%rax,%rdx,4), %eax
+; AVX512F-ONLY-NEXT:    leal (%rax,%rdx,8), %eax
+; AVX512F-ONLY-NEXT:    movl %edx, %ecx
+; AVX512F-ONLY-NEXT:    shll $4, %ecx
+; AVX512F-ONLY-NEXT:    orl %eax, %ecx
+; AVX512F-ONLY-NEXT:    movl %edx, %eax
+; AVX512F-ONLY-NEXT:    shll $5, %eax
+; AVX512F-ONLY-NEXT:    orl %ecx, %eax
+; AVX512F-ONLY-NEXT:    shll $6, %edx
+; AVX512F-ONLY-NEXT:    kmovw %k0, %ecx
+; AVX512F-ONLY-NEXT:    movl %ecx, %edi
+; AVX512F-ONLY-NEXT:    andl $1, %edi
+; AVX512F-ONLY-NEXT:    movl %edi, %r8d
+; AVX512F-ONLY-NEXT:    shll $7, %r8d
+; AVX512F-ONLY-NEXT:    orl %edx, %r8d
+; AVX512F-ONLY-NEXT:    movl %edi, %edx
+; AVX512F-ONLY-NEXT:    shll $8, %edx
+; AVX512F-ONLY-NEXT:    orl %r8d, %edx
+; AVX512F-ONLY-NEXT:    movl %edi, %r8d
+; AVX512F-ONLY-NEXT:    shll $9, %r8d
+; AVX512F-ONLY-NEXT:    orl %edx, %r8d
+; AVX512F-ONLY-NEXT:    movl %edi, %edx
+; AVX512F-ONLY-NEXT:    shll $10, %edx
+; AVX512F-ONLY-NEXT:    orl %r8d, %edx
+; AVX512F-ONLY-NEXT:    movl %edi, %r8d
+; AVX512F-ONLY-NEXT:    shll $11, %r8d
+; AVX512F-ONLY-NEXT:    orl %edx, %r8d
+; AVX512F-ONLY-NEXT:    shll $12, %edi
+; AVX512F-ONLY-NEXT:    orl %r8d, %edi
+; AVX512F-ONLY-NEXT:    shll $13, %ecx
+; AVX512F-ONLY-NEXT:    orl %edi, %ecx
+; AVX512F-ONLY-NEXT:    orl %eax, %ecx
+; AVX512F-ONLY-NEXT:    andl $16383, %ecx # imm = 0x3FFF
+; AVX512F-ONLY-NEXT:    movw %cx, (%rsi)
+; AVX512F-ONLY-NEXT:    retq
+;
+; AVX512DQ-LABEL: mask_replication_factor7_vf2:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    kmovb (%rdi), %k1
+; AVX512DQ-NEXT:    kshiftrb $1, %k1, %k0
+; AVX512DQ-NEXT:    kmovw %k1, %edx
+; AVX512DQ-NEXT:    andl $1, %edx
+; AVX512DQ-NEXT:    leal (%rdx,%rdx,2), %eax
+; AVX512DQ-NEXT:    leal (%rax,%rdx,4), %eax
+; AVX512DQ-NEXT:    leal (%rax,%rdx,8), %eax
+; AVX512DQ-NEXT:    movl %edx, %ecx
+; AVX512DQ-NEXT:    shll $4, %ecx
+; AVX512DQ-NEXT:    orl %eax, %ecx
+; AVX512DQ-NEXT:    movl %edx, %eax
+; AVX512DQ-NEXT:    shll $5, %eax
+; AVX512DQ-NEXT:    orl %ecx, %eax
+; AVX512DQ-NEXT:    shll $6, %edx
+; AVX512DQ-NEXT:    kmovw %k0, %ecx
+; AVX512DQ-NEXT:    movl %ecx, %edi
+; AVX512DQ-NEXT:    andl $1, %edi
+; AVX512DQ-NEXT:    movl %edi, %r8d
+; AVX512DQ-NEXT:    shll $7, %r8d
+; AVX512DQ-NEXT:    orl %edx, %r8d
+; AVX512DQ-NEXT:    movl %edi, %edx
+; AVX512DQ-NEXT:    shll $8, %edx
+; AVX512DQ-NEXT:    orl %r8d, %edx
+; AVX512DQ-NEXT:    movl %edi, %r8d
+; AVX512DQ-NEXT:    shll $9, %r8d
+; AVX512DQ-NEXT:    orl %edx, %r8d
+; AVX512DQ-NEXT:    movl %edi, %edx
+; AVX512DQ-NEXT:    shll $10, %edx
+; AVX512DQ-NEXT:    orl %r8d, %edx
+; AVX512DQ-NEXT:    movl %edi, %r8d
+; AVX512DQ-NEXT:    shll $11, %r8d
+; AVX512DQ-NEXT:    orl %edx, %r8d
+; AVX512DQ-NEXT:    shll $12, %edi
+; AVX512DQ-NEXT:    orl %r8d, %edi
+; AVX512DQ-NEXT:    shll $13, %ecx
+; AVX512DQ-NEXT:    orl %edi, %ecx
+; AVX512DQ-NEXT:    orl %eax, %ecx
+; AVX512DQ-NEXT:    andl $16383, %ecx # imm = 0x3FFF
+; AVX512DQ-NEXT:    movw %cx, (%rsi)
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: mask_replication_factor7_vf2:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    kmovw (%rdi), %k1
+; AVX512BW-NEXT:    kshiftrw $1, %k1, %k0
+; AVX512BW-NEXT:    kmovd %k1, %edx
+; AVX512BW-NEXT:    andl $1, %edx
+; AVX512BW-NEXT:    leal (%rdx,%rdx,2), %eax
+; AVX512BW-NEXT:    leal (%rax,%rdx,4), %eax
+; AVX512BW-NEXT:    leal (%rax,%rdx,8), %eax
+; AVX512BW-NEXT:    movl %edx, %ecx
+; AVX512BW-NEXT:    shll $4, %ecx
+; AVX512BW-NEXT:    orl %eax, %ecx
+; AVX512BW-NEXT:    movl %edx, %eax
+; AVX512BW-NEXT:    shll $5, %eax
+; AVX512BW-NEXT:    orl %ecx, %eax
+; AVX512BW-NEXT:    shll $6, %edx
+; AVX512BW-NEXT:    kmovd %k0, %ecx
+; AVX512BW-NEXT:    movl %ecx, %edi
+; AVX512BW-NEXT:    andl $1, %edi
+; AVX512BW-NEXT:    movl %edi, %r8d
+; AVX512BW-NEXT:    shll $7, %r8d
+; AVX512BW-NEXT:    orl %edx, %r8d
+; AVX512BW-NEXT:    movl %edi, %edx
+; AVX512BW-NEXT:    shll $8, %edx
+; AVX512BW-NEXT:    orl %r8d, %edx
+; AVX512BW-NEXT:    movl %edi, %r8d
+; AVX512BW-NEXT:    shll $9, %r8d
+; AVX512BW-NEXT:    orl %edx, %r8d
+; AVX512BW-NEXT:    movl %edi, %edx
+; AVX512BW-NEXT:    shll $10, %edx
+; AVX512BW-NEXT:    orl %r8d, %edx
+; AVX512BW-NEXT:    movl %edi, %r8d
+; AVX512BW-NEXT:    shll $11, %r8d
+; AVX512BW-NEXT:    orl %edx, %r8d
+; AVX512BW-NEXT:    shll $12, %edi
+; AVX512BW-NEXT:    orl %r8d, %edi
+; AVX512BW-NEXT:    shll $13, %ecx
+; AVX512BW-NEXT:    orl %edi, %ecx
+; AVX512BW-NEXT:    orl %eax, %ecx
+; AVX512BW-NEXT:    andl $16383, %ecx # imm = 0x3FFF
+; AVX512BW-NEXT:    movw %cx, (%rsi)
+; AVX512BW-NEXT:    retq
+  ; Source mask: 2 x i1 read from %in.vec.
+  %src.vec = load <2 x i1>, ptr %in.vec, align 64
+  ; Replicate: output lanes 0-6 take element 0, lanes 7-13 take element 1.
+  %tgt.mask = shufflevector <2 x i1> %src.vec, <2 x i1> poison, <14 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  ; Write the 14-element replicated mask to %out.vec.
+  store <14 x i1> %tgt.mask, ptr %out.vec, align 64
+  ret void
+}
+
+; Replication factor 7, 4 source elements: loads a <4 x i1> mask from %in.vec,
+; repeats each element 7 times (shuffle indices 0 x7, 1 x7, 2 x7, 3 x7) and
+; stores the resulting <28 x i1> mask to %out.vec.
+;
+; The assertions below are autogenerated (see the NOTE at the top of this file)
+; and must not be edited by hand. In every checked configuration the expected
+; code is scalar: each source bit is moved from a mask register into a GPR,
+; shifted copies are OR-ed into bits 0-6, 7-13, 14-20 and 21-27 for elements
+; 0-3 respectively, the result is masked to 28 bits (0xFFFFFFF) and written
+; with a 32-bit movl. The configurations differ only in the mask instructions
+; used (kmovw/kshiftrw, kmovb/kshiftrb with DQ, kmovd for the mask-to-GPR move
+; with BW).
+define void @mask_replication_factor7_vf4(ptr %in.vec, ptr %out.vec) nounwind {
+; AVX512F-ONLY-LABEL: mask_replication_factor7_vf4:
+; AVX512F-ONLY:       # %bb.0:
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k3
+; AVX512F-ONLY-NEXT:    kshiftrw $3, %k3, %k0
+; AVX512F-ONLY-NEXT:    kshiftrw $2, %k3, %k1
+; AVX512F-ONLY-NEXT:    kshiftrw $1, %k3, %k2
+; AVX512F-ONLY-NEXT:    kmovw %k3, %edx
+; AVX512F-ONLY-NEXT:    andl $1, %edx
+; AVX512F-ONLY-NEXT:    leal (%rdx,%rdx,2), %eax
+; AVX512F-ONLY-NEXT:    leal (%rax,%rdx,4), %eax
+; AVX512F-ONLY-NEXT:    leal (%rax,%rdx,8), %eax
+; AVX512F-ONLY-NEXT:    movl %edx, %ecx
+; AVX512F-ONLY-NEXT:    shll $4, %ecx
+; AVX512F-ONLY-NEXT:    orl %eax, %ecx
+; AVX512F-ONLY-NEXT:    movl %edx, %eax
+; AVX512F-ONLY-NEXT:    shll $5, %eax
+; AVX512F-ONLY-NEXT:    orl %ecx, %eax
+; AVX512F-ONLY-NEXT:    shll $6, %edx
+; AVX512F-ONLY-NEXT:    kmovw %k2, %ecx
+; AVX512F-ONLY-NEXT:    andl $1, %ecx
+; AVX512F-ONLY-NEXT:    movl %ecx, %edi
+; AVX512F-ONLY-NEXT:    shll $7, %edi
+; AVX512F-ONLY-NEXT:    orl %edx, %edi
+; AVX512F-ONLY-NEXT:    movl %ecx, %edx
+; AVX512F-ONLY-NEXT:    shll $8, %edx
+; AVX512F-ONLY-NEXT:    orl %edi, %edx
+; AVX512F-ONLY-NEXT:    movl %ecx, %edi
+; AVX512F-ONLY-NEXT:    shll $9, %edi
+; AVX512F-ONLY-NEXT:    orl %edx, %edi
+; AVX512F-ONLY-NEXT:    movl %ecx, %edx
+; AVX512F-ONLY-NEXT:    shll $10, %edx
+; AVX512F-ONLY-NEXT:    orl %edi, %edx
+; AVX512F-ONLY-NEXT:    movl %ecx, %edi
+; AVX512F-ONLY-NEXT:    shll $11, %edi
+; AVX512F-ONLY-NEXT:    orl %edx, %edi
+; AVX512F-ONLY-NEXT:    movl %ecx, %edx
+; AVX512F-ONLY-NEXT:    shll $12, %edx
+; AVX512F-ONLY-NEXT:    orl %edi, %edx
+; AVX512F-ONLY-NEXT:    shll $13, %ecx
+; AVX512F-ONLY-NEXT:    orl %edx, %ecx
+; AVX512F-ONLY-NEXT:    kmovw %k1, %edx
+; AVX512F-ONLY-NEXT:    andl $1, %edx
+; AVX512F-ONLY-NEXT:    movl %edx, %edi
+; AVX512F-ONLY-NEXT:    shll $14, %edi
+; AVX512F-ONLY-NEXT:    orl %ecx, %edi
+; AVX512F-ONLY-NEXT:    movl %edx, %ecx
+; AVX512F-ONLY-NEXT:    shll $15, %ecx
+; AVX512F-ONLY-NEXT:    orl %edi, %ecx
+; AVX512F-ONLY-NEXT:    movl %edx, %edi
+; AVX512F-ONLY-NEXT:    shll $16, %edi
+; AVX512F-ONLY-NEXT:    orl %ecx, %edi
+; AVX512F-ONLY-NEXT:    movl %edx, %ecx
+; AVX512F-ONLY-NEXT:    shll $17, %ecx
+; AVX512F-ONLY-NEXT:    orl %edi, %ecx
+; AVX512F-ONLY-NEXT:    movl %edx, %edi
+; AVX512F-ONLY-NEXT:    shll $18, %edi
+; AVX512F-ONLY-NEXT:    orl %ecx, %edi
+; AVX512F-ONLY-NEXT:    movl %edx, %ecx
+; AVX512F-ONLY-NEXT:    shll $19, %ecx
+; AVX512F-ONLY-NEXT:    orl %edi, %ecx
+; AVX512F-ONLY-NEXT:    shll $20, %edx
+; AVX512F-ONLY-NEXT:    orl %ecx, %edx
+; AVX512F-ONLY-NEXT:    kmovw %k0, %ecx
+; AVX512F-ONLY-NEXT:    movl %ecx, %edi
+; AVX512F-ONLY-NEXT:    andl $1, %edi
+; AVX512F-ONLY-NEXT:    movl %edi, %r8d
+; AVX512F-ONLY-NEXT:    shll $21, %r8d
+; AVX512F-ONLY-NEXT:    orl %edx, %r8d
+; AVX512F-ONLY-NEXT:    movl %edi, %edx
+; AVX512F-ONLY-NEXT:    shll $22, %edx
+; AVX512F-ONLY-NEXT:    orl %r8d, %edx
+; AVX512F-ONLY-NEXT:    movl %edi, %r8d
+; AVX512F-ONLY-NEXT:    shll $23, %r8d
+; AVX512F-ONLY-NEXT:    orl %edx, %r8d
+; AVX512F-ONLY-NEXT:    movl %edi, %edx
+; AVX512F-ONLY-NEXT:    shll $24, %edx
+; AVX512F-ONLY-NEXT:    orl %r8d, %edx
+; AVX512F-ONLY-NEXT:    movl %edi, %r8d
+; AVX512F-ONLY-NEXT:    shll $25, %r8d
+; AVX512F-ONLY-NEXT:    orl %edx, %r8d
+; AVX512F-ONLY-NEXT:    shll $26, %edi
+; AVX512F-ONLY-NEXT:    orl %r8d, %edi
+; AVX512F-ONLY-NEXT:    shll $27, %ecx
+; AVX512F-ONLY-NEXT:    orl %edi, %ecx
+; AVX512F-ONLY-NEXT:    orl %eax, %ecx
+; AVX512F-ONLY-NEXT:    andl $268435455, %ecx # imm = 0xFFFFFFF
+; AVX512F-ONLY-NEXT:    movl %ecx, (%rsi)
+; AVX512F-ONLY-NEXT:    retq
+;
+; AVX512DQ-LABEL: mask_replication_factor7_vf4:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    kmovb (%rdi), %k3
+; AVX512DQ-NEXT:    kshiftrb $3, %k3, %k0
+; AVX512DQ-NEXT:    kshiftrb $2, %k3, %k1
+; AVX512DQ-NEXT:    kshiftrb $1, %k3, %k2
+; AVX512DQ-NEXT:    kmovw %k3, %edx
+; AVX512DQ-NEXT:    andl $1, %edx
+; AVX512DQ-NEXT:    leal (%rdx,%rdx,2), %eax
+; AVX512DQ-NEXT:    leal (%rax,%rdx,4), %eax
+; AVX512DQ-NEXT:    leal (%rax,%rdx,8), %eax
+; AVX512DQ-NEXT:    movl %edx, %ecx
+; AVX512DQ-NEXT:    shll $4, %ecx
+; AVX512DQ-NEXT:    orl %eax, %ecx
+; AVX512DQ-NEXT:    movl %edx, %eax
+; AVX512DQ-NEXT:    shll $5, %eax
+; AVX512DQ-NEXT:    orl %ecx, %eax
+; AVX512DQ-NEXT:    shll $6, %edx
+; AVX512DQ-NEXT:    kmovw %k2, %ecx
+; AVX512DQ-NEXT:    andl $1, %ecx
+; AVX512DQ-NEXT:    movl %ecx, %edi
+; AVX512DQ-NEXT:    shll $7, %edi
+; AVX512DQ-NEXT:    orl %edx, %edi
+; AVX512DQ-NEXT:    movl %ecx, %edx
+; AVX512DQ-NEXT:    shll $8, %edx
+; AVX512DQ-NEXT:    orl %edi, %edx
+; AVX512DQ-NEXT:    movl %ecx, %edi
+; AVX512DQ-NEXT:    shll $9, %edi
+; AVX512DQ-NEXT:    orl %edx, %edi
+; AVX512DQ-NEXT:    movl %ecx, %edx
+; AVX512DQ-NEXT:    shll $10, %edx
+; AVX512DQ-NEXT:    orl %edi, %edx
+; AVX512DQ-NEXT:    movl %ecx, %edi
+; AVX512DQ-NEXT:    shll $11, %edi
+; AVX512DQ-NEXT:    orl %edx, %edi
+; AVX512DQ-NEXT:    movl %ecx, %edx
+; AVX512DQ-NEXT:    shll $12, %edx
+; AVX512DQ-NEXT:    orl %edi, %edx
+; AVX512DQ-NEXT:    shll $13, %ecx
+; AVX512DQ-NEXT:    orl %edx, %ecx
+; AVX512DQ-NEXT:    kmovw %k1, %edx
+; AVX512DQ-NEXT:    andl $1, %edx
+; AVX512DQ-NEXT:    movl %edx, %edi
+; AVX512DQ-NEXT:    shll $14, %edi
+; AVX512DQ-NEXT:    orl %ecx, %edi
+; AVX512DQ-NEXT:    movl %edx, %ecx
+; AVX512DQ-NEXT:    shll $15, %ecx
+; AVX512DQ-NEXT:    orl %edi, %ecx
+; AVX512DQ-NEXT:    movl %edx, %edi
+; AVX512DQ-NEXT:    shll $16, %edi
+; AVX512DQ-NEXT:    orl %ecx, %edi
+; AVX512DQ-NEXT:    movl %edx, %ecx
+; AVX512DQ-NEXT:    shll $17, %ecx
+; AVX512DQ-NEXT:    orl %edi, %ecx
+; AVX512DQ-NEXT:    movl %edx, %edi
+; AVX512DQ-NEXT:    shll $18, %edi
+; AVX512DQ-NEXT:    orl %ecx, %edi
+; AVX512DQ-NEXT:    movl %edx, %ecx
+; AVX512DQ-NEXT:    shll $19, %ecx
+; AVX512DQ-NEXT:    orl %edi, %ecx
+; AVX512DQ-NEXT:    shll $20, %edx
+; AVX512DQ-NEXT:    orl %ecx, %edx
+; AVX512DQ-NEXT:    kmovw %k0, %ecx
+; AVX512DQ-NEXT:    movl %ecx, %edi
+; AVX512DQ-NEXT:    andl $1, %edi
+; AVX512DQ-NEXT:    movl %edi, %r8d
+; AVX512DQ-NEXT:    shll $21, %r8d
+; AVX512DQ-NEXT:    orl %edx, %r8d
+; AVX512DQ-NEXT:    movl %edi, %edx
+; AVX512DQ-NEXT:    shll $22, %edx
+; AVX512DQ-NEXT:    orl %r8d, %edx
+; AVX512DQ-NEXT:    movl %edi, %r8d
+; AVX512DQ-NEXT:    shll $23, %r8d
+; AVX512DQ-NEXT:    orl %edx, %r8d
+; AVX512DQ-NEXT:    movl %edi, %edx
+; AVX512DQ-NEXT:    shll $24, %edx
+; AVX512DQ-NEXT:    orl %r8d, %edx
+; AVX512DQ-NEXT:    movl %edi, %r8d
+; AVX512DQ-NEXT:    shll $25, %r8d
+; AVX512DQ-NEXT:    orl %edx, %r8d
+; AVX512DQ-NEXT:    shll $26, %edi
+; AVX512DQ-NEXT:    orl %r8d, %edi
+; AVX512DQ-NEXT:    shll $27, %ecx
+; AVX512DQ-NEXT:    orl %edi, %ecx
+; AVX512DQ-NEXT:    orl %eax, %ecx
+; AVX512DQ-NEXT:    andl $268435455, %ecx # imm = 0xFFFFFFF
+; AVX512DQ-NEXT:    movl %ecx, (%rsi)
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: mask_replication_factor7_vf4:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    kmovw (%rdi), %k3
+; AVX512BW-NEXT:    kshiftrw $3, %k3, %k0
+; AVX512BW-NEXT:    kshiftrw $2, %k3, %k1
+; AVX512BW-NEXT:    kshiftrw $1, %k3, %k2
+; AVX512BW-NEXT:    kmovd %k3, %edx
+; AVX512BW-NEXT:    andl $1, %edx
+; AVX512BW-NEXT:    leal (%rdx,%rdx,2), %eax
+; AVX512BW-NEXT:    leal (%rax,%rdx,4), %eax
+; AVX512BW-NEXT:    leal (%rax,%rdx,8), %eax
+; AVX512BW-NEXT:    movl %edx, %ecx
+; AVX512BW-NEXT:    shll $4, %ecx
+; AVX512BW-NEXT:    orl %eax, %ecx
+; AVX512BW-NEXT:    movl %edx, %eax
+; AVX512BW-NEXT:    shll $5, %eax
+; AVX512BW-NEXT:    orl %ecx, %eax
+; AVX512BW-NEXT:    shll $6, %edx
+; AVX512BW-NEXT:    kmovd %k2, %ecx
+; AVX512BW-NEXT:    andl $1, %ecx
+; AVX512BW-NEXT:    movl %ecx, %edi
+; AVX512BW-NEXT:    shll $7, %edi
+; AVX512BW-NEXT:    orl %edx, %edi
+; AVX512BW-NEXT:    movl %ecx, %edx
+; AVX512BW-NEXT:    shll $8, %edx
+; AVX512BW-NEXT:    orl %edi, %edx
+; AVX512BW-NEXT:    movl %ecx, %edi
+; AVX512BW-NEXT:    shll $9, %edi
+; AVX512BW-NEXT:    orl %edx, %edi
+; AVX512BW-NEXT:    movl %ecx, %edx
+; AVX512BW-NEXT:    shll $10, %edx
+; AVX512BW-NEXT:    orl %edi, %edx
+; AVX512BW-NEXT:    movl %ecx, %edi
+; AVX512BW-NEXT:    shll $11, %edi
+; AVX512BW-NEXT:    orl %edx, %edi
+; AVX512BW-NEXT:    movl %ecx, %edx
+; AVX512BW-NEXT:    shll $12, %edx
+; AVX512BW-NEXT:    orl %edi, %edx
+; AVX512BW-NEXT:    shll $13, %ecx
+; AVX512BW-NEXT:    orl %edx, %ecx
+; AVX512BW-NEXT:    kmovd %k1, %edx
+; AVX512BW-NEXT:    andl $1, %edx
+; AVX512BW-NEXT:    movl %edx, %edi
+; AVX512BW-NEXT:    shll $14, %edi
+; AVX512BW-NEXT:    orl %ecx, %edi
+; AVX512BW-NEXT:    movl %edx, %ecx
+; AVX512BW-NEXT:    shll $15, %ecx
+; AVX512BW-NEXT:    orl %edi, %ecx
+; AVX512BW-NEXT:    movl %edx, %edi
+; AVX512BW-NEXT:    shll $16, %edi
+; AVX512BW-NEXT:    orl %ecx, %edi
+; AVX512BW-NEXT:    movl %edx, %ecx
+; AVX512BW-NEXT:    shll $17, %ecx
+; AVX512BW-NEXT:    orl %edi, %ecx
+; AVX512BW-NEXT:    movl %edx, %edi
+; AVX512BW-NEXT:    shll $18, %edi
+; AVX512BW-NEXT:    orl %ecx, %edi
+; AVX512BW-NEXT:    movl %edx, %ecx
+; AVX512BW-NEXT:    shll $19, %ecx
+; AVX512BW-NEXT:    orl %edi, %ecx
+; AVX512BW-NEXT:    shll $20, %edx
+; AVX512BW-NEXT:    orl %ecx, %edx
+; AVX512BW-NEXT:    kmovd %k0, %ecx
+; AVX512BW-NEXT:    movl %ecx, %edi
+; AVX512BW-NEXT:    andl $1, %edi
+; AVX512BW-NEXT:    movl %edi, %r8d
+; AVX512BW-NEXT:    shll $21, %r8d
+; AVX512BW-NEXT:    orl %edx, %r8d
+; AVX512BW-NEXT:    movl %edi, %edx
+; AVX512BW-NEXT:    shll $22, %edx
+; AVX512BW-NEXT:    orl %r8d, %edx
+; AVX512BW-NEXT:    movl %edi, %r8d
+; AVX512BW-NEXT:    shll $23, %r8d
+; AVX512BW-NEXT:    orl %edx, %r8d
+; AVX512BW-NEXT:    movl %edi, %edx
+; AVX512BW-NEXT:    shll $24, %edx
+; AVX512BW-NEXT:    orl %r8d, %edx
+; AVX512BW-NEXT:    movl %edi, %r8d
+; AVX512BW-NEXT:    shll $25, %r8d
+; AVX512BW-NEXT:    orl %edx, %r8d
+; AVX512BW-NEXT:    shll $26, %edi
+; AVX512BW-NEXT:    orl %r8d, %edi
+; AVX512BW-NEXT:    shll $27, %ecx
+; AVX512BW-NEXT:    orl %edi, %ecx
+; AVX512BW-NEXT:    orl %eax, %ecx
+; AVX512BW-NEXT:    andl $268435455, %ecx # imm = 0xFFFFFFF
+; AVX512BW-NEXT:    movl %ecx, (%rsi)
+; AVX512BW-NEXT:    retq
+  ; Source mask: 4 x i1 read from %in.vec.
+  %src.vec = load <4 x i1>, ptr %in.vec, align 64
+  ; Replicate: output lanes 7*i .. 7*i+6 all take source element i (i = 0..3).
+  %tgt.mask = shufflevector <4 x i1> %src.vec, <4 x i1> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  ; Write the 28-element replicated mask to %out.vec.
+  store <28 x i1> %tgt.mask, ptr %out.vec, align 64
+  ret void
+}
+
+; Mask replication, factor 7, 8 source elements.
+; Loads an <8 x i1> mask from %in.vec, widens it to <56 x i1> by repeating every
+; source element 7 times in order, and stores the result to %out.vec.
+;
+; The assertions in this function are autogenerated (see the note at the top of
+; the file); regenerate them with utils/update_llc_test_checks.py instead of
+; editing them by hand.
+;
+; What the three check groups below currently expect is fully scalar code:
+;  - each source bit is moved out of the mask register with a kshiftr + kmov
+;    pair and then replicated into consecutive bit positions of a 64-bit GPR
+;    with lea/shl/or sequences;
+;  - the 56 result bits are written with three stores through %rsi: a 4-byte
+;    movl at offset 0, a 2-byte movw at offset 4 and a 1-byte movb at offset 6.
+; The groups visibly differ in the mask instructions used: the F-only group
+; uses kmovw/kshiftrw, the DQ group loads with kmovb and shifts with kshiftrb,
+; and the BW group uses kmovd for the mask-to-GPR moves.
+define void @mask_replication_factor7_vf8(ptr %in.vec, ptr %out.vec) nounwind {
+; AVX512F-ONLY-LABEL: mask_replication_factor7_vf8:
+; AVX512F-ONLY:       # %bb.0:
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k2
+; AVX512F-ONLY-NEXT:    kshiftrw $7, %k2, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, %eax
+; AVX512F-ONLY-NEXT:    kshiftrw $6, %k2, %k0
+; AVX512F-ONLY-NEXT:    kshiftrw $5, %k2, %k1
+; AVX512F-ONLY-NEXT:    kshiftrw $4, %k2, %k3
+; AVX512F-ONLY-NEXT:    kmovw %k3, %ecx
+; AVX512F-ONLY-NEXT:    kshiftrw $3, %k2, %k3
+; AVX512F-ONLY-NEXT:    kmovw %k3, %edx
+; AVX512F-ONLY-NEXT:    kshiftrw $2, %k2, %k3
+; AVX512F-ONLY-NEXT:    kmovw %k3, %r8d
+; AVX512F-ONLY-NEXT:    kshiftrw $1, %k2, %k3
+; AVX512F-ONLY-NEXT:    kmovw %k3, %r9d
+; AVX512F-ONLY-NEXT:    kmovw %k2, %edi
+; AVX512F-ONLY-NEXT:    movzbl %dil, %r10d
+; AVX512F-ONLY-NEXT:    andl $1, %r10d
+; AVX512F-ONLY-NEXT:    leaq (%r10,%r10,2), %rdi
+; AVX512F-ONLY-NEXT:    leaq (%rdi,%r10,4), %rdi
+; AVX512F-ONLY-NEXT:    leaq (%rdi,%r10,8), %rdi
+; AVX512F-ONLY-NEXT:    movq %r10, %r11
+; AVX512F-ONLY-NEXT:    shlq $4, %r11
+; AVX512F-ONLY-NEXT:    orq %rdi, %r11
+; AVX512F-ONLY-NEXT:    movq %r10, %rdi
+; AVX512F-ONLY-NEXT:    shlq $5, %rdi
+; AVX512F-ONLY-NEXT:    orq %r11, %rdi
+; AVX512F-ONLY-NEXT:    shlq $6, %r10
+; AVX512F-ONLY-NEXT:    movzbl %r9b, %r9d
+; AVX512F-ONLY-NEXT:    andl $1, %r9d
+; AVX512F-ONLY-NEXT:    movq %r9, %r11
+; AVX512F-ONLY-NEXT:    shlq $7, %r11
+; AVX512F-ONLY-NEXT:    orq %r10, %r11
+; AVX512F-ONLY-NEXT:    movq %r9, %r10
+; AVX512F-ONLY-NEXT:    shlq $8, %r10
+; AVX512F-ONLY-NEXT:    orq %r11, %r10
+; AVX512F-ONLY-NEXT:    movq %r9, %r11
+; AVX512F-ONLY-NEXT:    shlq $9, %r11
+; AVX512F-ONLY-NEXT:    orq %r10, %r11
+; AVX512F-ONLY-NEXT:    movq %r9, %r10
+; AVX512F-ONLY-NEXT:    shlq $10, %r10
+; AVX512F-ONLY-NEXT:    orq %r11, %r10
+; AVX512F-ONLY-NEXT:    movq %r9, %r11
+; AVX512F-ONLY-NEXT:    shlq $11, %r11
+; AVX512F-ONLY-NEXT:    orq %r10, %r11
+; AVX512F-ONLY-NEXT:    movq %r9, %r10
+; AVX512F-ONLY-NEXT:    shlq $12, %r10
+; AVX512F-ONLY-NEXT:    orq %r11, %r10
+; AVX512F-ONLY-NEXT:    shlq $13, %r9
+; AVX512F-ONLY-NEXT:    orq %r10, %r9
+; AVX512F-ONLY-NEXT:    movzbl %r8b, %r8d
+; AVX512F-ONLY-NEXT:    andl $1, %r8d
+; AVX512F-ONLY-NEXT:    movq %r8, %r10
+; AVX512F-ONLY-NEXT:    shlq $14, %r10
+; AVX512F-ONLY-NEXT:    orq %r9, %r10
+; AVX512F-ONLY-NEXT:    movq %r8, %r9
+; AVX512F-ONLY-NEXT:    shlq $15, %r9
+; AVX512F-ONLY-NEXT:    orq %r10, %r9
+; AVX512F-ONLY-NEXT:    movq %r8, %r10
+; AVX512F-ONLY-NEXT:    shlq $16, %r10
+; AVX512F-ONLY-NEXT:    orq %r9, %r10
+; AVX512F-ONLY-NEXT:    movq %r8, %r9
+; AVX512F-ONLY-NEXT:    shlq $17, %r9
+; AVX512F-ONLY-NEXT:    orq %r10, %r9
+; AVX512F-ONLY-NEXT:    movq %r8, %r10
+; AVX512F-ONLY-NEXT:    shlq $18, %r10
+; AVX512F-ONLY-NEXT:    orq %r9, %r10
+; AVX512F-ONLY-NEXT:    movq %r8, %r9
+; AVX512F-ONLY-NEXT:    shlq $19, %r9
+; AVX512F-ONLY-NEXT:    orq %r10, %r9
+; AVX512F-ONLY-NEXT:    shlq $20, %r8
+; AVX512F-ONLY-NEXT:    orq %r9, %r8
+; AVX512F-ONLY-NEXT:    movzbl %dl, %r9d
+; AVX512F-ONLY-NEXT:    andl $1, %r9d
+; AVX512F-ONLY-NEXT:    movq %r9, %rdx
+; AVX512F-ONLY-NEXT:    shlq $21, %rdx
+; AVX512F-ONLY-NEXT:    orq %r8, %rdx
+; AVX512F-ONLY-NEXT:    movq %r9, %r8
+; AVX512F-ONLY-NEXT:    shlq $22, %r8
+; AVX512F-ONLY-NEXT:    orq %rdx, %r8
+; AVX512F-ONLY-NEXT:    movq %r9, %rdx
+; AVX512F-ONLY-NEXT:    shlq $23, %rdx
+; AVX512F-ONLY-NEXT:    orq %r8, %rdx
+; AVX512F-ONLY-NEXT:    movq %r9, %r8
+; AVX512F-ONLY-NEXT:    shlq $24, %r8
+; AVX512F-ONLY-NEXT:    orq %rdx, %r8
+; AVX512F-ONLY-NEXT:    movq %r9, %rdx
+; AVX512F-ONLY-NEXT:    shlq $25, %rdx
+; AVX512F-ONLY-NEXT:    orq %r8, %rdx
+; AVX512F-ONLY-NEXT:    movq %r9, %r8
+; AVX512F-ONLY-NEXT:    shlq $26, %r8
+; AVX512F-ONLY-NEXT:    orq %rdx, %r8
+; AVX512F-ONLY-NEXT:    shlq $27, %r9
+; AVX512F-ONLY-NEXT:    orq %r8, %r9
+; AVX512F-ONLY-NEXT:    movzbl %cl, %edx
+; AVX512F-ONLY-NEXT:    andl $1, %edx
+; AVX512F-ONLY-NEXT:    movq %rdx, %r8
+; AVX512F-ONLY-NEXT:    shlq $28, %r8
+; AVX512F-ONLY-NEXT:    orq %r9, %r8
+; AVX512F-ONLY-NEXT:    movq %rdx, %r9
+; AVX512F-ONLY-NEXT:    shlq $29, %r9
+; AVX512F-ONLY-NEXT:    orq %r8, %r9
+; AVX512F-ONLY-NEXT:    movq %rdx, %r8
+; AVX512F-ONLY-NEXT:    shlq $30, %r8
+; AVX512F-ONLY-NEXT:    orq %r9, %r8
+; AVX512F-ONLY-NEXT:    shlq $31, %rdx
+; AVX512F-ONLY-NEXT:    orq %r8, %rdx
+; AVX512F-ONLY-NEXT:    orq %rdi, %rdx
+; AVX512F-ONLY-NEXT:    movl %edx, (%rsi)
+; AVX512F-ONLY-NEXT:    andl $1, %ecx
+; AVX512F-ONLY-NEXT:    movq %rcx, %rdi
+; AVX512F-ONLY-NEXT:    shlq $32, %rdi
+; AVX512F-ONLY-NEXT:    movq %rcx, %r8
+; AVX512F-ONLY-NEXT:    shlq $33, %r8
+; AVX512F-ONLY-NEXT:    orq %rdi, %r8
+; AVX512F-ONLY-NEXT:    shlq $34, %rcx
+; AVX512F-ONLY-NEXT:    orq %r8, %rcx
+; AVX512F-ONLY-NEXT:    kmovw %k1, %edi
+; AVX512F-ONLY-NEXT:    andl $1, %edi
+; AVX512F-ONLY-NEXT:    movq %rdi, %r8
+; AVX512F-ONLY-NEXT:    shlq $35, %r8
+; AVX512F-ONLY-NEXT:    orq %rcx, %r8
+; AVX512F-ONLY-NEXT:    movq %rdi, %rcx
+; AVX512F-ONLY-NEXT:    shlq $36, %rcx
+; AVX512F-ONLY-NEXT:    orq %r8, %rcx
+; AVX512F-ONLY-NEXT:    movq %rdi, %r8
+; AVX512F-ONLY-NEXT:    shlq $37, %r8
+; AVX512F-ONLY-NEXT:    orq %rcx, %r8
+; AVX512F-ONLY-NEXT:    movq %rdi, %rcx
+; AVX512F-ONLY-NEXT:    shlq $38, %rcx
+; AVX512F-ONLY-NEXT:    orq %r8, %rcx
+; AVX512F-ONLY-NEXT:    movq %rdi, %r8
+; AVX512F-ONLY-NEXT:    shlq $39, %r8
+; AVX512F-ONLY-NEXT:    orq %rcx, %r8
+; AVX512F-ONLY-NEXT:    movq %rdi, %rcx
+; AVX512F-ONLY-NEXT:    shlq $40, %rcx
+; AVX512F-ONLY-NEXT:    orq %r8, %rcx
+; AVX512F-ONLY-NEXT:    shlq $41, %rdi
+; AVX512F-ONLY-NEXT:    orq %rcx, %rdi
+; AVX512F-ONLY-NEXT:    kmovw %k0, %ecx
+; AVX512F-ONLY-NEXT:    andl $1, %ecx
+; AVX512F-ONLY-NEXT:    movq %rcx, %r8
+; AVX512F-ONLY-NEXT:    shlq $42, %r8
+; AVX512F-ONLY-NEXT:    orq %rdi, %r8
+; AVX512F-ONLY-NEXT:    movq %rcx, %rdi
+; AVX512F-ONLY-NEXT:    shlq $43, %rdi
+; AVX512F-ONLY-NEXT:    orq %r8, %rdi
+; AVX512F-ONLY-NEXT:    movq %rcx, %r8
+; AVX512F-ONLY-NEXT:    shlq $44, %r8
+; AVX512F-ONLY-NEXT:    orq %rdi, %r8
+; AVX512F-ONLY-NEXT:    movq %rcx, %rdi
+; AVX512F-ONLY-NEXT:    shlq $45, %rdi
+; AVX512F-ONLY-NEXT:    orq %r8, %rdi
+; AVX512F-ONLY-NEXT:    movq %rcx, %r8
+; AVX512F-ONLY-NEXT:    shlq $46, %r8
+; AVX512F-ONLY-NEXT:    orq %rdi, %r8
+; AVX512F-ONLY-NEXT:    movq %rcx, %rdi
+; AVX512F-ONLY-NEXT:    shlq $47, %rdi
+; AVX512F-ONLY-NEXT:    orq %r8, %rdi
+; AVX512F-ONLY-NEXT:    shlq $48, %rcx
+; AVX512F-ONLY-NEXT:    orq %rdi, %rcx
+; AVX512F-ONLY-NEXT:    movzbl %al, %edi
+; AVX512F-ONLY-NEXT:    # kill: def $eax killed $eax def $rax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %r8
+; AVX512F-ONLY-NEXT:    shlq $49, %r8
+; AVX512F-ONLY-NEXT:    orq %rcx, %r8
+; AVX512F-ONLY-NEXT:    movq %rax, %rcx
+; AVX512F-ONLY-NEXT:    shlq $50, %rcx
+; AVX512F-ONLY-NEXT:    orq %r8, %rcx
+; AVX512F-ONLY-NEXT:    movq %rax, %r8
+; AVX512F-ONLY-NEXT:    shlq $51, %r8
+; AVX512F-ONLY-NEXT:    orq %rcx, %r8
+; AVX512F-ONLY-NEXT:    movq %rax, %rcx
+; AVX512F-ONLY-NEXT:    shlq $52, %rcx
+; AVX512F-ONLY-NEXT:    orq %r8, %rcx
+; AVX512F-ONLY-NEXT:    movq %rax, %r8
+; AVX512F-ONLY-NEXT:    shlq $53, %r8
+; AVX512F-ONLY-NEXT:    orq %rcx, %r8
+; AVX512F-ONLY-NEXT:    shlq $54, %rax
+; AVX512F-ONLY-NEXT:    orq %r8, %rax
+; AVX512F-ONLY-NEXT:    shlq $55, %rdi
+; AVX512F-ONLY-NEXT:    orq %rax, %rdi
+; AVX512F-ONLY-NEXT:    orq %rdx, %rdi
+; AVX512F-ONLY-NEXT:    movq %rdi, %rax
+; AVX512F-ONLY-NEXT:    shrq $48, %rax
+; AVX512F-ONLY-NEXT:    movb %al, 6(%rsi)
+; AVX512F-ONLY-NEXT:    shrq $32, %rdi
+; AVX512F-ONLY-NEXT:    movw %di, 4(%rsi)
+; AVX512F-ONLY-NEXT:    retq
+;
+; AVX512DQ-LABEL: mask_replication_factor7_vf8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    kmovb (%rdi), %k2
+; AVX512DQ-NEXT:    kshiftrb $7, %k2, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    kshiftrb $6, %k2, %k0
+; AVX512DQ-NEXT:    kshiftrb $5, %k2, %k1
+; AVX512DQ-NEXT:    kshiftrb $4, %k2, %k3
+; AVX512DQ-NEXT:    kmovw %k3, %ecx
+; AVX512DQ-NEXT:    kshiftrb $3, %k2, %k3
+; AVX512DQ-NEXT:    kmovw %k3, %edx
+; AVX512DQ-NEXT:    kshiftrb $2, %k2, %k3
+; AVX512DQ-NEXT:    kmovw %k3, %r8d
+; AVX512DQ-NEXT:    kshiftrb $1, %k2, %k3
+; AVX512DQ-NEXT:    kmovw %k3, %r9d
+; AVX512DQ-NEXT:    kmovw %k2, %edi
+; AVX512DQ-NEXT:    movzbl %dil, %r10d
+; AVX512DQ-NEXT:    andl $1, %r10d
+; AVX512DQ-NEXT:    leaq (%r10,%r10,2), %rdi
+; AVX512DQ-NEXT:    leaq (%rdi,%r10,4), %rdi
+; AVX512DQ-NEXT:    leaq (%rdi,%r10,8), %rdi
+; AVX512DQ-NEXT:    movq %r10, %r11
+; AVX512DQ-NEXT:    shlq $4, %r11
+; AVX512DQ-NEXT:    orq %rdi, %r11
+; AVX512DQ-NEXT:    movq %r10, %rdi
+; AVX512DQ-NEXT:    shlq $5, %rdi
+; AVX512DQ-NEXT:    orq %r11, %rdi
+; AVX512DQ-NEXT:    shlq $6, %r10
+; AVX512DQ-NEXT:    movzbl %r9b, %r9d
+; AVX512DQ-NEXT:    andl $1, %r9d
+; AVX512DQ-NEXT:    movq %r9, %r11
+; AVX512DQ-NEXT:    shlq $7, %r11
+; AVX512DQ-NEXT:    orq %r10, %r11
+; AVX512DQ-NEXT:    movq %r9, %r10
+; AVX512DQ-NEXT:    shlq $8, %r10
+; AVX512DQ-NEXT:    orq %r11, %r10
+; AVX512DQ-NEXT:    movq %r9, %r11
+; AVX512DQ-NEXT:    shlq $9, %r11
+; AVX512DQ-NEXT:    orq %r10, %r11
+; AVX512DQ-NEXT:    movq %r9, %r10
+; AVX512DQ-NEXT:    shlq $10, %r10
+; AVX512DQ-NEXT:    orq %r11, %r10
+; AVX512DQ-NEXT:    movq %r9, %r11
+; AVX512DQ-NEXT:    shlq $11, %r11
+; AVX512DQ-NEXT:    orq %r10, %r11
+; AVX512DQ-NEXT:    movq %r9, %r10
+; AVX512DQ-NEXT:    shlq $12, %r10
+; AVX512DQ-NEXT:    orq %r11, %r10
+; AVX512DQ-NEXT:    shlq $13, %r9
+; AVX512DQ-NEXT:    orq %r10, %r9
+; AVX512DQ-NEXT:    movzbl %r8b, %r8d
+; AVX512DQ-NEXT:    andl $1, %r8d
+; AVX512DQ-NEXT:    movq %r8, %r10
+; AVX512DQ-NEXT:    shlq $14, %r10
+; AVX512DQ-NEXT:    orq %r9, %r10
+; AVX512DQ-NEXT:    movq %r8, %r9
+; AVX512DQ-NEXT:    shlq $15, %r9
+; AVX512DQ-NEXT:    orq %r10, %r9
+; AVX512DQ-NEXT:    movq %r8, %r10
+; AVX512DQ-NEXT:    shlq $16, %r10
+; AVX512DQ-NEXT:    orq %r9, %r10
+; AVX512DQ-NEXT:    movq %r8, %r9
+; AVX512DQ-NEXT:    shlq $17, %r9
+; AVX512DQ-NEXT:    orq %r10, %r9
+; AVX512DQ-NEXT:    movq %r8, %r10
+; AVX512DQ-NEXT:    shlq $18, %r10
+; AVX512DQ-NEXT:    orq %r9, %r10
+; AVX512DQ-NEXT:    movq %r8, %r9
+; AVX512DQ-NEXT:    shlq $19, %r9
+; AVX512DQ-NEXT:    orq %r10, %r9
+; AVX512DQ-NEXT:    shlq $20, %r8
+; AVX512DQ-NEXT:    orq %r9, %r8
+; AVX512DQ-NEXT:    movzbl %dl, %r9d
+; AVX512DQ-NEXT:    andl $1, %r9d
+; AVX512DQ-NEXT:    movq %r9, %rdx
+; AVX512DQ-NEXT:    shlq $21, %rdx
+; AVX512DQ-NEXT:    orq %r8, %rdx
+; AVX512DQ-NEXT:    movq %r9, %r8
+; AVX512DQ-NEXT:    shlq $22, %r8
+; AVX512DQ-NEXT:    orq %rdx, %r8
+; AVX512DQ-NEXT:    movq %r9, %rdx
+; AVX512DQ-NEXT:    shlq $23, %rdx
+; AVX512DQ-NEXT:    orq %r8, %rdx
+; AVX512DQ-NEXT:    movq %r9, %r8
+; AVX512DQ-NEXT:    shlq $24, %r8
+; AVX512DQ-NEXT:    orq %rdx, %r8
+; AVX512DQ-NEXT:    movq %r9, %rdx
+; AVX512DQ-NEXT:    shlq $25, %rdx
+; AVX512DQ-NEXT:    orq %r8, %rdx
+; AVX512DQ-NEXT:    movq %r9, %r8
+; AVX512DQ-NEXT:    shlq $26, %r8
+; AVX512DQ-NEXT:    orq %rdx, %r8
+; AVX512DQ-NEXT:    shlq $27, %r9
+; AVX512DQ-NEXT:    orq %r8, %r9
+; AVX512DQ-NEXT:    movzbl %cl, %edx
+; AVX512DQ-NEXT:    andl $1, %edx
+; AVX512DQ-NEXT:    movq %rdx, %r8
+; AVX512DQ-NEXT:    shlq $28, %r8
+; AVX512DQ-NEXT:    orq %r9, %r8
+; AVX512DQ-NEXT:    movq %rdx, %r9
+; AVX512DQ-NEXT:    shlq $29, %r9
+; AVX512DQ-NEXT:    orq %r8, %r9
+; AVX512DQ-NEXT:    movq %rdx, %r8
+; AVX512DQ-NEXT:    shlq $30, %r8
+; AVX512DQ-NEXT:    orq %r9, %r8
+; AVX512DQ-NEXT:    shlq $31, %rdx
+; AVX512DQ-NEXT:    orq %r8, %rdx
+; AVX512DQ-NEXT:    orq %rdi, %rdx
+; AVX512DQ-NEXT:    movl %edx, (%rsi)
+; AVX512DQ-NEXT:    andl $1, %ecx
+; AVX512DQ-NEXT:    movq %rcx, %rdi
+; AVX512DQ-NEXT:    shlq $32, %rdi
+; AVX512DQ-NEXT:    movq %rcx, %r8
+; AVX512DQ-NEXT:    shlq $33, %r8
+; AVX512DQ-NEXT:    orq %rdi, %r8
+; AVX512DQ-NEXT:    shlq $34, %rcx
+; AVX512DQ-NEXT:    orq %r8, %rcx
+; AVX512DQ-NEXT:    kmovw %k1, %edi
+; AVX512DQ-NEXT:    andl $1, %edi
+; AVX512DQ-NEXT:    movq %rdi, %r8
+; AVX512DQ-NEXT:    shlq $35, %r8
+; AVX512DQ-NEXT:    orq %rcx, %r8
+; AVX512DQ-NEXT:    movq %rdi, %rcx
+; AVX512DQ-NEXT:    shlq $36, %rcx
+; AVX512DQ-NEXT:    orq %r8, %rcx
+; AVX512DQ-NEXT:    movq %rdi, %r8
+; AVX512DQ-NEXT:    shlq $37, %r8
+; AVX512DQ-NEXT:    orq %rcx, %r8
+; AVX512DQ-NEXT:    movq %rdi, %rcx
+; AVX512DQ-NEXT:    shlq $38, %rcx
+; AVX512DQ-NEXT:    orq %r8, %rcx
+; AVX512DQ-NEXT:    movq %rdi, %r8
+; AVX512DQ-NEXT:    shlq $39, %r8
+; AVX512DQ-NEXT:    orq %rcx, %r8
+; AVX512DQ-NEXT:    movq %rdi, %rcx
+; AVX512DQ-NEXT:    shlq $40, %rcx
+; AVX512DQ-NEXT:    orq %r8, %rcx
+; AVX512DQ-NEXT:    shlq $41, %rdi
+; AVX512DQ-NEXT:    orq %rcx, %rdi
+; AVX512DQ-NEXT:    kmovw %k0, %ecx
+; AVX512DQ-NEXT:    andl $1, %ecx
+; AVX512DQ-NEXT:    movq %rcx, %r8
+; AVX512DQ-NEXT:    shlq $42, %r8
+; AVX512DQ-NEXT:    orq %rdi, %r8
+; AVX512DQ-NEXT:    movq %rcx, %rdi
+; AVX512DQ-NEXT:    shlq $43, %rdi
+; AVX512DQ-NEXT:    orq %r8, %rdi
+; AVX512DQ-NEXT:    movq %rcx, %r8
+; AVX512DQ-NEXT:    shlq $44, %r8
+; AVX512DQ-NEXT:    orq %rdi, %r8
+; AVX512DQ-NEXT:    movq %rcx, %rdi
+; AVX512DQ-NEXT:    shlq $45, %rdi
+; AVX512DQ-NEXT:    orq %r8, %rdi
+; AVX512DQ-NEXT:    movq %rcx, %r8
+; AVX512DQ-NEXT:    shlq $46, %r8
+; AVX512DQ-NEXT:    orq %rdi, %r8
+; AVX512DQ-NEXT:    movq %rcx, %rdi
+; AVX512DQ-NEXT:    shlq $47, %rdi
+; AVX512DQ-NEXT:    orq %r8, %rdi
+; AVX512DQ-NEXT:    shlq $48, %rcx
+; AVX512DQ-NEXT:    orq %rdi, %rcx
+; AVX512DQ-NEXT:    movzbl %al, %edi
+; AVX512DQ-NEXT:    # kill: def $eax killed $eax def $rax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %r8
+; AVX512DQ-NEXT:    shlq $49, %r8
+; AVX512DQ-NEXT:    orq %rcx, %r8
+; AVX512DQ-NEXT:    movq %rax, %rcx
+; AVX512DQ-NEXT:    shlq $50, %rcx
+; AVX512DQ-NEXT:    orq %r8, %rcx
+; AVX512DQ-NEXT:    movq %rax, %r8
+; AVX512DQ-NEXT:    shlq $51, %r8
+; AVX512DQ-NEXT:    orq %rcx, %r8
+; AVX512DQ-NEXT:    movq %rax, %rcx
+; AVX512DQ-NEXT:    shlq $52, %rcx
+; AVX512DQ-NEXT:    orq %r8, %rcx
+; AVX512DQ-NEXT:    movq %rax, %r8
+; AVX512DQ-NEXT:    shlq $53, %r8
+; AVX512DQ-NEXT:    orq %rcx, %r8
+; AVX512DQ-NEXT:    shlq $54, %rax
+; AVX512DQ-NEXT:    orq %r8, %rax
+; AVX512DQ-NEXT:    shlq $55, %rdi
+; AVX512DQ-NEXT:    orq %rax, %rdi
+; AVX512DQ-NEXT:    orq %rdx, %rdi
+; AVX512DQ-NEXT:    movq %rdi, %rax
+; AVX512DQ-NEXT:    shrq $48, %rax
+; AVX512DQ-NEXT:    movb %al, 6(%rsi)
+; AVX512DQ-NEXT:    shrq $32, %rdi
+; AVX512DQ-NEXT:    movw %di, 4(%rsi)
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: mask_replication_factor7_vf8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    kmovw (%rdi), %k2
+; AVX512BW-NEXT:    kshiftrw $7, %k2, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    kshiftrw $6, %k2, %k0
+; AVX512BW-NEXT:    kshiftrw $5, %k2, %k1
+; AVX512BW-NEXT:    kshiftrw $4, %k2, %k3
+; AVX512BW-NEXT:    kmovd %k3, %ecx
+; AVX512BW-NEXT:    kshiftrw $3, %k2, %k3
+; AVX512BW-NEXT:    kmovd %k3, %edx
+; AVX512BW-NEXT:    kshiftrw $2, %k2, %k3
+; AVX512BW-NEXT:    kmovd %k3, %r8d
+; AVX512BW-NEXT:    kshiftrw $1, %k2, %k3
+; AVX512BW-NEXT:    kmovd %k3, %r9d
+; AVX512BW-NEXT:    kmovd %k2, %edi
+; AVX512BW-NEXT:    movzbl %dil, %r10d
+; AVX512BW-NEXT:    andl $1, %r10d
+; AVX512BW-NEXT:    leaq (%r10,%r10,2), %rdi
+; AVX512BW-NEXT:    leaq (%rdi,%r10,4), %rdi
+; AVX512BW-NEXT:    leaq (%rdi,%r10,8), %rdi
+; AVX512BW-NEXT:    movq %r10, %r11
+; AVX512BW-NEXT:    shlq $4, %r11
+; AVX512BW-NEXT:    orq %rdi, %r11
+; AVX512BW-NEXT:    movq %r10, %rdi
+; AVX512BW-NEXT:    shlq $5, %rdi
+; AVX512BW-NEXT:    orq %r11, %rdi
+; AVX512BW-NEXT:    shlq $6, %r10
+; AVX512BW-NEXT:    movzbl %r9b, %r9d
+; AVX512BW-NEXT:    andl $1, %r9d
+; AVX512BW-NEXT:    movq %r9, %r11
+; AVX512BW-NEXT:    shlq $7, %r11
+; AVX512BW-NEXT:    orq %r10, %r11
+; AVX512BW-NEXT:    movq %r9, %r10
+; AVX512BW-NEXT:    shlq $8, %r10
+; AVX512BW-NEXT:    orq %r11, %r10
+; AVX512BW-NEXT:    movq %r9, %r11
+; AVX512BW-NEXT:    shlq $9, %r11
+; AVX512BW-NEXT:    orq %r10, %r11
+; AVX512BW-NEXT:    movq %r9, %r10
+; AVX512BW-NEXT:    shlq $10, %r10
+; AVX512BW-NEXT:    orq %r11, %r10
+; AVX512BW-NEXT:    movq %r9, %r11
+; AVX512BW-NEXT:    shlq $11, %r11
+; AVX512BW-NEXT:    orq %r10, %r11
+; AVX512BW-NEXT:    movq %r9, %r10
+; AVX512BW-NEXT:    shlq $12, %r10
+; AVX512BW-NEXT:    orq %r11, %r10
+; AVX512BW-NEXT:    shlq $13, %r9
+; AVX512BW-NEXT:    orq %r10, %r9
+; AVX512BW-NEXT:    movzbl %r8b, %r8d
+; AVX512BW-NEXT:    andl $1, %r8d
+; AVX512BW-NEXT:    movq %r8, %r10
+; AVX512BW-NEXT:    shlq $14, %r10
+; AVX512BW-NEXT:    orq %r9, %r10
+; AVX512BW-NEXT:    movq %r8, %r9
+; AVX512BW-NEXT:    shlq $15, %r9
+; AVX512BW-NEXT:    orq %r10, %r9
+; AVX512BW-NEXT:    movq %r8, %r10
+; AVX512BW-NEXT:    shlq $16, %r10
+; AVX512BW-NEXT:    orq %r9, %r10
+; AVX512BW-NEXT:    movq %r8, %r9
+; AVX512BW-NEXT:    shlq $17, %r9
+; AVX512BW-NEXT:    orq %r10, %r9
+; AVX512BW-NEXT:    movq %r8, %r10
+; AVX512BW-NEXT:    shlq $18, %r10
+; AVX512BW-NEXT:    orq %r9, %r10
+; AVX512BW-NEXT:    movq %r8, %r9
+; AVX512BW-NEXT:    shlq $19, %r9
+; AVX512BW-NEXT:    orq %r10, %r9
+; AVX512BW-NEXT:    shlq $20, %r8
+; AVX512BW-NEXT:    orq %r9, %r8
+; AVX512BW-NEXT:    movzbl %dl, %r9d
+; AVX512BW-NEXT:    andl $1, %r9d
+; AVX512BW-NEXT:    movq %r9, %rdx
+; AVX512BW-NEXT:    shlq $21, %rdx
+; AVX512BW-NEXT:    orq %r8, %rdx
+; AVX512BW-NEXT:    movq %r9, %r8
+; AVX512BW-NEXT:    shlq $22, %r8
+; AVX512BW-NEXT:    orq %rdx, %r8
+; AVX512BW-NEXT:    movq %r9, %rdx
+; AVX512BW-NEXT:    shlq $23, %rdx
+; AVX512BW-NEXT:    orq %r8, %rdx
+; AVX512BW-NEXT:    movq %r9, %r8
+; AVX512BW-NEXT:    shlq $24, %r8
+; AVX512BW-NEXT:    orq %rdx, %r8
+; AVX512BW-NEXT:    movq %r9, %rdx
+; AVX512BW-NEXT:    shlq $25, %rdx
+; AVX512BW-NEXT:    orq %r8, %rdx
+; AVX512BW-NEXT:    movq %r9, %r8
+; AVX512BW-NEXT:    shlq $26, %r8
+; AVX512BW-NEXT:    orq %rdx, %r8
+; AVX512BW-NEXT:    shlq $27, %r9
+; AVX512BW-NEXT:    orq %r8, %r9
+; AVX512BW-NEXT:    movzbl %cl, %edx
+; AVX512BW-NEXT:    andl $1, %edx
+; AVX512BW-NEXT:    movq %rdx, %r8
+; AVX512BW-NEXT:    shlq $28, %r8
+; AVX512BW-NEXT:    orq %r9, %r8
+; AVX512BW-NEXT:    movq %rdx, %r9
+; AVX512BW-NEXT:    shlq $29, %r9
+; AVX512BW-NEXT:    orq %r8, %r9
+; AVX512BW-NEXT:    movq %rdx, %r8
+; AVX512BW-NEXT:    shlq $30, %r8
+; AVX512BW-NEXT:    orq %r9, %r8
+; AVX512BW-NEXT:    shlq $31, %rdx
+; AVX512BW-NEXT:    orq %r8, %rdx
+; AVX512BW-NEXT:    orq %rdi, %rdx
+; AVX512BW-NEXT:    movl %edx, (%rsi)
+; AVX512BW-NEXT:    andl $1, %ecx
+; AVX512BW-NEXT:    movq %rcx, %rdi
+; AVX512BW-NEXT:    shlq $32, %rdi
+; AVX512BW-NEXT:    movq %rcx, %r8
+; AVX512BW-NEXT:    shlq $33, %r8
+; AVX512BW-NEXT:    orq %rdi, %r8
+; AVX512BW-NEXT:    shlq $34, %rcx
+; AVX512BW-NEXT:    orq %r8, %rcx
+; AVX512BW-NEXT:    kmovd %k1, %edi
+; AVX512BW-NEXT:    andl $1, %edi
+; AVX512BW-NEXT:    movq %rdi, %r8
+; AVX512BW-NEXT:    shlq $35, %r8
+; AVX512BW-NEXT:    orq %rcx, %r8
+; AVX512BW-NEXT:    movq %rdi, %rcx
+; AVX512BW-NEXT:    shlq $36, %rcx
+; AVX512BW-NEXT:    orq %r8, %rcx
+; AVX512BW-NEXT:    movq %rdi, %r8
+; AVX512BW-NEXT:    shlq $37, %r8
+; AVX512BW-NEXT:    orq %rcx, %r8
+; AVX512BW-NEXT:    movq %rdi, %rcx
+; AVX512BW-NEXT:    shlq $38, %rcx
+; AVX512BW-NEXT:    orq %r8, %rcx
+; AVX512BW-NEXT:    movq %rdi, %r8
+; AVX512BW-NEXT:    shlq $39, %r8
+; AVX512BW-NEXT:    orq %rcx, %r8
+; AVX512BW-NEXT:    movq %rdi, %rcx
+; AVX512BW-NEXT:    shlq $40, %rcx
+; AVX512BW-NEXT:    orq %r8, %rcx
+; AVX512BW-NEXT:    shlq $41, %rdi
+; AVX512BW-NEXT:    orq %rcx, %rdi
+; AVX512BW-NEXT:    kmovd %k0, %ecx
+; AVX512BW-NEXT:    andl $1, %ecx
+; AVX512BW-NEXT:    movq %rcx, %r8
+; AVX512BW-NEXT:    shlq $42, %r8
+; AVX512BW-NEXT:    orq %rdi, %r8
+; AVX512BW-NEXT:    movq %rcx, %rdi
+; AVX512BW-NEXT:    shlq $43, %rdi
+; AVX512BW-NEXT:    orq %r8, %rdi
+; AVX512BW-NEXT:    movq %rcx, %r8
+; AVX512BW-NEXT:    shlq $44, %r8
+; AVX512BW-NEXT:    orq %rdi, %r8
+; AVX512BW-NEXT:    movq %rcx, %rdi
+; AVX512BW-NEXT:    shlq $45, %rdi
+; AVX512BW-NEXT:    orq %r8, %rdi
+; AVX512BW-NEXT:    movq %rcx, %r8
+; AVX512BW-NEXT:    shlq $46, %r8
+; AVX512BW-NEXT:    orq %rdi, %r8
+; AVX512BW-NEXT:    movq %rcx, %rdi
+; AVX512BW-NEXT:    shlq $47, %rdi
+; AVX512BW-NEXT:    orq %r8, %rdi
+; AVX512BW-NEXT:    shlq $48, %rcx
+; AVX512BW-NEXT:    orq %rdi, %rcx
+; AVX512BW-NEXT:    movzbl %al, %edi
+; AVX512BW-NEXT:    # kill: def $eax killed $eax def $rax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %r8
+; AVX512BW-NEXT:    shlq $49, %r8
+; AVX512BW-NEXT:    orq %rcx, %r8
+; AVX512BW-NEXT:    movq %rax, %rcx
+; AVX512BW-NEXT:    shlq $50, %rcx
+; AVX512BW-NEXT:    orq %r8, %rcx
+; AVX512BW-NEXT:    movq %rax, %r8
+; AVX512BW-NEXT:    shlq $51, %r8
+; AVX512BW-NEXT:    orq %rcx, %r8
+; AVX512BW-NEXT:    movq %rax, %rcx
+; AVX512BW-NEXT:    shlq $52, %rcx
+; AVX512BW-NEXT:    orq %r8, %rcx
+; AVX512BW-NEXT:    movq %rax, %r8
+; AVX512BW-NEXT:    shlq $53, %r8
+; AVX512BW-NEXT:    orq %rcx, %r8
+; AVX512BW-NEXT:    shlq $54, %rax
+; AVX512BW-NEXT:    orq %r8, %rax
+; AVX512BW-NEXT:    shlq $55, %rdi
+; AVX512BW-NEXT:    orq %rax, %rdi
+; AVX512BW-NEXT:    orq %rdx, %rdi
+; AVX512BW-NEXT:    movq %rdi, %rax
+; AVX512BW-NEXT:    shrq $48, %rax
+; AVX512BW-NEXT:    movb %al, 6(%rsi)
+; AVX512BW-NEXT:    shrq $32, %rdi
+; AVX512BW-NEXT:    movw %di, 4(%rsi)
+; AVX512BW-NEXT:    retq
+  %src.vec = load <8 x i1>, ptr %in.vec, align 64
+  ; The shuffle mask lists each source index 0..7 seven times in a row (56 indices in total).
+  %tgt.mask = shufflevector <8 x i1> %src.vec, <8 x i1> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+  store <56 x i1> %tgt.mask, ptr %out.vec, align 64
+  ret void
+}
+
+define void @mask_replication_factor7_vf16(ptr %in.vec, ptr %out.vec) nounwind {
+; AVX512F-ONLY-LABEL: mask_replication_factor7_vf16:
+; AVX512F-ONLY:       # %bb.0:
+; AVX512F-ONLY-NEXT:    pushq %rbp
+; AVX512F-ONLY-NEXT:    pushq %r15
+; AVX512F-ONLY-NEXT:    pushq %r14
+; AVX512F-ONLY-NEXT:    pushq %r12
+; AVX512F-ONLY-NEXT:    pushq %rbx
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k4
+; AVX512F-ONLY-NEXT:    kshiftrw $8, %k4, %k0
+; AVX512F-ONLY-NEXT:    kshiftrw $7, %k4, %k1
+; AVX512F-ONLY-NEXT:    kshiftrw $6, %k4, %k2
+; AVX512F-ONLY-NEXT:    kshiftrw $5, %k4, %k3
+; AVX512F-ONLY-NEXT:    kshiftrw $4, %k4, %k5
+; AVX512F-ONLY-NEXT:    kmovw %k5, %ecx
+; AVX512F-ONLY-NEXT:    kshiftrw $3, %k4, %k5
+; AVX512F-ONLY-NEXT:    kmovw %k5, %edx
+; AVX512F-ONLY-NEXT:    kshiftrw $2, %k4, %k5
+; AVX512F-ONLY-NEXT:    kmovw %k5, %edi
+; AVX512F-ONLY-NEXT:    kshiftrw $1, %k4, %k5
+; AVX512F-ONLY-NEXT:    kmovw %k5, %r9d
+; AVX512F-ONLY-NEXT:    kshiftrw $15, %k4, %k5
+; AVX512F-ONLY-NEXT:    kmovw %k5, %r8d
+; AVX512F-ONLY-NEXT:    kshiftrw $14, %k4, %k5
+; AVX512F-ONLY-NEXT:    kshiftrw $13, %k4, %k6
+; AVX512F-ONLY-NEXT:    kmovw %k6, %r10d
+; AVX512F-ONLY-NEXT:    kshiftrw $12, %k4, %k6
+; AVX512F-ONLY-NEXT:    kmovw %k6, %ebx
+; AVX512F-ONLY-NEXT:    kshiftrw $11, %k4, %k6
+; AVX512F-ONLY-NEXT:    kmovw %k6, %ebp
+; AVX512F-ONLY-NEXT:    kshiftrw $10, %k4, %k6
+; AVX512F-ONLY-NEXT:    kmovw %k6, %r14d
+; AVX512F-ONLY-NEXT:    kshiftrw $9, %k4, %k6
+; AVX512F-ONLY-NEXT:    kmovw %k6, %eax
+; AVX512F-ONLY-NEXT:    movzbl %al, %eax
+; AVX512F-ONLY-NEXT:    movl %eax, %r11d
+; AVX512F-ONLY-NEXT:    andl $1, %r11d
+; AVX512F-ONLY-NEXT:    leaq (%r11,%r11,2), %r15
+; AVX512F-ONLY-NEXT:    leaq (%r15,%r11,4), %r15
+; AVX512F-ONLY-NEXT:    leaq (%r15,%r11,8), %r15
+; AVX512F-ONLY-NEXT:    movq %r11, %r12
+; AVX512F-ONLY-NEXT:    shlq $4, %r12
+; AVX512F-ONLY-NEXT:    orq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $5, %r11
+; AVX512F-ONLY-NEXT:    orq %r12, %r11
+; AVX512F-ONLY-NEXT:    movzbl %r14b, %r15d
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    movq %r15, %r14
+; AVX512F-ONLY-NEXT:    shlq $6, %r14
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $7, %r12
+; AVX512F-ONLY-NEXT:    orq %r14, %r12
+; AVX512F-ONLY-NEXT:    movq %r15, %r14
+; AVX512F-ONLY-NEXT:    shlq $8, %r14
+; AVX512F-ONLY-NEXT:    orq %r12, %r14
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $9, %r12
+; AVX512F-ONLY-NEXT:    orq %r14, %r12
+; AVX512F-ONLY-NEXT:    movq %r15, %r14
+; AVX512F-ONLY-NEXT:    shlq $10, %r14
+; AVX512F-ONLY-NEXT:    orq %r12, %r14
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $11, %r12
+; AVX512F-ONLY-NEXT:    orq %r14, %r12
+; AVX512F-ONLY-NEXT:    shlq $12, %r15
+; AVX512F-ONLY-NEXT:    orq %r12, %r15
+; AVX512F-ONLY-NEXT:    movzbl %bpl, %r14d
+; AVX512F-ONLY-NEXT:    andl $1, %r14d
+; AVX512F-ONLY-NEXT:    movq %r14, %r12
+; AVX512F-ONLY-NEXT:    shlq $13, %r12
+; AVX512F-ONLY-NEXT:    orq %r15, %r12
+; AVX512F-ONLY-NEXT:    movq %r14, %r15
+; AVX512F-ONLY-NEXT:    shlq $14, %r15
+; AVX512F-ONLY-NEXT:    orq %r12, %r15
+; AVX512F-ONLY-NEXT:    movq %r14, %r12
+; AVX512F-ONLY-NEXT:    shlq $15, %r12
+; AVX512F-ONLY-NEXT:    orq %r15, %r12
+; AVX512F-ONLY-NEXT:    movq %r14, %r15
+; AVX512F-ONLY-NEXT:    shlq $16, %r15
+; AVX512F-ONLY-NEXT:    orq %r12, %r15
+; AVX512F-ONLY-NEXT:    movq %r14, %r12
+; AVX512F-ONLY-NEXT:    shlq $17, %r12
+; AVX512F-ONLY-NEXT:    orq %r15, %r12
+; AVX512F-ONLY-NEXT:    movq %r14, %r15
+; AVX512F-ONLY-NEXT:    shlq $18, %r15
+; AVX512F-ONLY-NEXT:    orq %r12, %r15
+; AVX512F-ONLY-NEXT:    shlq $19, %r14
+; AVX512F-ONLY-NEXT:    orq %r15, %r14
+; AVX512F-ONLY-NEXT:    movzbl %bl, %r15d
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    movq %r15, %rbx
+; AVX512F-ONLY-NEXT:    shlq $20, %rbx
+; AVX512F-ONLY-NEXT:    orq %r14, %rbx
+; AVX512F-ONLY-NEXT:    movq %r15, %r14
+; AVX512F-ONLY-NEXT:    shlq $21, %r14
+; AVX512F-ONLY-NEXT:    orq %rbx, %r14
+; AVX512F-ONLY-NEXT:    movq %r15, %rbx
+; AVX512F-ONLY-NEXT:    shlq $22, %rbx
+; AVX512F-ONLY-NEXT:    orq %r14, %rbx
+; AVX512F-ONLY-NEXT:    movq %r15, %r14
+; AVX512F-ONLY-NEXT:    shlq $23, %r14
+; AVX512F-ONLY-NEXT:    orq %rbx, %r14
+; AVX512F-ONLY-NEXT:    movq %r15, %rbx
+; AVX512F-ONLY-NEXT:    shlq $24, %rbx
+; AVX512F-ONLY-NEXT:    orq %r14, %rbx
+; AVX512F-ONLY-NEXT:    movq %r15, %r14
+; AVX512F-ONLY-NEXT:    shlq $25, %r14
+; AVX512F-ONLY-NEXT:    orq %rbx, %r14
+; AVX512F-ONLY-NEXT:    shlq $26, %r15
+; AVX512F-ONLY-NEXT:    orq %r14, %r15
+; AVX512F-ONLY-NEXT:    movzbl %r10b, %ebx
+; AVX512F-ONLY-NEXT:    andl $1, %ebx
+; AVX512F-ONLY-NEXT:    movq %rbx, %r14
+; AVX512F-ONLY-NEXT:    shlq $27, %r14
+; AVX512F-ONLY-NEXT:    orq %r15, %r14
+; AVX512F-ONLY-NEXT:    movq %rbx, %r15
+; AVX512F-ONLY-NEXT:    shlq $28, %r15
+; AVX512F-ONLY-NEXT:    orq %r14, %r15
+; AVX512F-ONLY-NEXT:    movq %rbx, %r14
+; AVX512F-ONLY-NEXT:    shlq $29, %r14
+; AVX512F-ONLY-NEXT:    orq %r15, %r14
+; AVX512F-ONLY-NEXT:    movq %rbx, %r15
+; AVX512F-ONLY-NEXT:    shlq $30, %r15
+; AVX512F-ONLY-NEXT:    orq %r14, %r15
+; AVX512F-ONLY-NEXT:    shlq $31, %rbx
+; AVX512F-ONLY-NEXT:    orq %r15, %rbx
+; AVX512F-ONLY-NEXT:    orq %r11, %rbx
+; AVX512F-ONLY-NEXT:    movl %ebx, 8(%rsi)
+; AVX512F-ONLY-NEXT:    andl $1, %r10d
+; AVX512F-ONLY-NEXT:    movq %r10, %r11
+; AVX512F-ONLY-NEXT:    shlq $32, %r11
+; AVX512F-ONLY-NEXT:    shlq $33, %r10
+; AVX512F-ONLY-NEXT:    orq %r11, %r10
+; AVX512F-ONLY-NEXT:    kmovw %k5, %r11d
+; AVX512F-ONLY-NEXT:    andl $1, %r11d
+; AVX512F-ONLY-NEXT:    movq %r11, %r14
+; AVX512F-ONLY-NEXT:    shlq $34, %r14
+; AVX512F-ONLY-NEXT:    orq %r10, %r14
+; AVX512F-ONLY-NEXT:    movq %r11, %r10
+; AVX512F-ONLY-NEXT:    shlq $35, %r10
+; AVX512F-ONLY-NEXT:    orq %r14, %r10
+; AVX512F-ONLY-NEXT:    movq %r11, %r14
+; AVX512F-ONLY-NEXT:    shlq $36, %r14
+; AVX512F-ONLY-NEXT:    orq %r10, %r14
+; AVX512F-ONLY-NEXT:    movq %r11, %r10
+; AVX512F-ONLY-NEXT:    shlq $37, %r10
+; AVX512F-ONLY-NEXT:    orq %r14, %r10
+; AVX512F-ONLY-NEXT:    movq %r11, %r14
+; AVX512F-ONLY-NEXT:    shlq $38, %r14
+; AVX512F-ONLY-NEXT:    orq %r10, %r14
+; AVX512F-ONLY-NEXT:    movq %r11, %r10
+; AVX512F-ONLY-NEXT:    shlq $39, %r10
+; AVX512F-ONLY-NEXT:    orq %r14, %r10
+; AVX512F-ONLY-NEXT:    shlq $40, %r11
+; AVX512F-ONLY-NEXT:    orq %r10, %r11
+; AVX512F-ONLY-NEXT:    movzbl %r8b, %r10d
+; AVX512F-ONLY-NEXT:    # kill: def $r8d killed $r8d def $r8
+; AVX512F-ONLY-NEXT:    andl $1, %r8d
+; AVX512F-ONLY-NEXT:    movq %r8, %r14
+; AVX512F-ONLY-NEXT:    shlq $41, %r14
+; AVX512F-ONLY-NEXT:    orq %r11, %r14
+; AVX512F-ONLY-NEXT:    movq %r8, %r11
+; AVX512F-ONLY-NEXT:    shlq $42, %r11
+; AVX512F-ONLY-NEXT:    orq %r14, %r11
+; AVX512F-ONLY-NEXT:    movq %r8, %r14
+; AVX512F-ONLY-NEXT:    shlq $43, %r14
+; AVX512F-ONLY-NEXT:    orq %r11, %r14
+; AVX512F-ONLY-NEXT:    movq %r8, %r11
+; AVX512F-ONLY-NEXT:    shlq $44, %r11
+; AVX512F-ONLY-NEXT:    orq %r14, %r11
+; AVX512F-ONLY-NEXT:    movq %r8, %r14
+; AVX512F-ONLY-NEXT:    shlq $45, %r14
+; AVX512F-ONLY-NEXT:    orq %r11, %r14
+; AVX512F-ONLY-NEXT:    shlq $46, %r8
+; AVX512F-ONLY-NEXT:    orq %r14, %r8
+; AVX512F-ONLY-NEXT:    shlq $47, %r10
+; AVX512F-ONLY-NEXT:    orq %r8, %r10
+; AVX512F-ONLY-NEXT:    orq %rbx, %r10
+; AVX512F-ONLY-NEXT:    shrq $32, %r10
+; AVX512F-ONLY-NEXT:    movw %r10w, 12(%rsi)
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r8d
+; AVX512F-ONLY-NEXT:    movzbl %r8b, %r10d
+; AVX512F-ONLY-NEXT:    andl $1, %r10d
+; AVX512F-ONLY-NEXT:    leaq (%r10,%r10,2), %r8
+; AVX512F-ONLY-NEXT:    leaq (%r8,%r10,4), %r8
+; AVX512F-ONLY-NEXT:    leaq (%r8,%r10,8), %r8
+; AVX512F-ONLY-NEXT:    movq %r10, %r11
+; AVX512F-ONLY-NEXT:    shlq $4, %r11
+; AVX512F-ONLY-NEXT:    orq %r8, %r11
+; AVX512F-ONLY-NEXT:    movq %r10, %r8
+; AVX512F-ONLY-NEXT:    shlq $5, %r8
+; AVX512F-ONLY-NEXT:    orq %r11, %r8
+; AVX512F-ONLY-NEXT:    shlq $6, %r10
+; AVX512F-ONLY-NEXT:    movzbl %r9b, %r9d
+; AVX512F-ONLY-NEXT:    andl $1, %r9d
+; AVX512F-ONLY-NEXT:    movq %r9, %r11
+; AVX512F-ONLY-NEXT:    shlq $7, %r11
+; AVX512F-ONLY-NEXT:    orq %r10, %r11
+; AVX512F-ONLY-NEXT:    movq %r9, %r10
+; AVX512F-ONLY-NEXT:    shlq $8, %r10
+; AVX512F-ONLY-NEXT:    orq %r11, %r10
+; AVX512F-ONLY-NEXT:    movq %r9, %r11
+; AVX512F-ONLY-NEXT:    shlq $9, %r11
+; AVX512F-ONLY-NEXT:    orq %r10, %r11
+; AVX512F-ONLY-NEXT:    movq %r9, %r10
+; AVX512F-ONLY-NEXT:    shlq $10, %r10
+; AVX512F-ONLY-NEXT:    orq %r11, %r10
+; AVX512F-ONLY-NEXT:    movq %r9, %r11
+; AVX512F-ONLY-NEXT:    shlq $11, %r11
+; AVX512F-ONLY-NEXT:    orq %r10, %r11
+; AVX512F-ONLY-NEXT:    movq %r9, %r10
+; AVX512F-ONLY-NEXT:    shlq $12, %r10
+; AVX512F-ONLY-NEXT:    orq %r11, %r10
+; AVX512F-ONLY-NEXT:    shlq $13, %r9
+; AVX512F-ONLY-NEXT:    orq %r10, %r9
+; AVX512F-ONLY-NEXT:    movzbl %dil, %edi
+; AVX512F-ONLY-NEXT:    andl $1, %edi
+; AVX512F-ONLY-NEXT:    movq %rdi, %r10
+; AVX512F-ONLY-NEXT:    shlq $14, %r10
+; AVX512F-ONLY-NEXT:    orq %r9, %r10
+; AVX512F-ONLY-NEXT:    movq %rdi, %r9
+; AVX512F-ONLY-NEXT:    shlq $15, %r9
+; AVX512F-ONLY-NEXT:    orq %r10, %r9
+; AVX512F-ONLY-NEXT:    movq %rdi, %r10
+; AVX512F-ONLY-NEXT:    shlq $16, %r10
+; AVX512F-ONLY-NEXT:    orq %r9, %r10
+; AVX512F-ONLY-NEXT:    movq %rdi, %r9
+; AVX512F-ONLY-NEXT:    shlq $17, %r9
+; AVX512F-ONLY-NEXT:    orq %r10, %r9
+; AVX512F-ONLY-NEXT:    movq %rdi, %r10
+; AVX512F-ONLY-NEXT:    shlq $18, %r10
+; AVX512F-ONLY-NEXT:    orq %r9, %r10
+; AVX512F-ONLY-NEXT:    movq %rdi, %r9
+; AVX512F-ONLY-NEXT:    shlq $19, %r9
+; AVX512F-ONLY-NEXT:    orq %r10, %r9
+; AVX512F-ONLY-NEXT:    shlq $20, %rdi
+; AVX512F-ONLY-NEXT:    orq %r9, %rdi
+; AVX512F-ONLY-NEXT:    movzbl %dl, %edx
+; AVX512F-ONLY-NEXT:    andl $1, %edx
+; AVX512F-ONLY-NEXT:    movq %rdx, %r9
+; AVX512F-ONLY-NEXT:    shlq $21, %r9
+; AVX512F-ONLY-NEXT:    orq %rdi, %r9
+; AVX512F-ONLY-NEXT:    movq %rdx, %rdi
+; AVX512F-ONLY-NEXT:    shlq $22, %rdi
+; AVX512F-ONLY-NEXT:    orq %r9, %rdi
+; AVX512F-ONLY-NEXT:    movq %rdx, %r9
+; AVX512F-ONLY-NEXT:    shlq $23, %r9
+; AVX512F-ONLY-NEXT:    orq %rdi, %r9
+; AVX512F-ONLY-NEXT:    movq %rdx, %rdi
+; AVX512F-ONLY-NEXT:    shlq $24, %rdi
+; AVX512F-ONLY-NEXT:    orq %r9, %rdi
+; AVX512F-ONLY-NEXT:    movq %rdx, %r9
+; AVX512F-ONLY-NEXT:    shlq $25, %r9
+; AVX512F-ONLY-NEXT:    orq %rdi, %r9
+; AVX512F-ONLY-NEXT:    movq %rdx, %rdi
+; AVX512F-ONLY-NEXT:    shlq $26, %rdi
+; AVX512F-ONLY-NEXT:    orq %r9, %rdi
+; AVX512F-ONLY-NEXT:    shlq $27, %rdx
+; AVX512F-ONLY-NEXT:    orq %rdi, %rdx
+; AVX512F-ONLY-NEXT:    movzbl %cl, %edi
+; AVX512F-ONLY-NEXT:    andl $1, %edi
+; AVX512F-ONLY-NEXT:    movq %rdi, %r9
+; AVX512F-ONLY-NEXT:    shlq $28, %r9
+; AVX512F-ONLY-NEXT:    orq %rdx, %r9
+; AVX512F-ONLY-NEXT:    movq %rdi, %rdx
+; AVX512F-ONLY-NEXT:    shlq $29, %rdx
+; AVX512F-ONLY-NEXT:    orq %r9, %rdx
+; AVX512F-ONLY-NEXT:    movq %rdi, %r9
+; AVX512F-ONLY-NEXT:    shlq $30, %r9
+; AVX512F-ONLY-NEXT:    orq %rdx, %r9
+; AVX512F-ONLY-NEXT:    shlq $31, %rdi
+; AVX512F-ONLY-NEXT:    orq %r9, %rdi
+; AVX512F-ONLY-NEXT:    andl $1, %ecx
+; AVX512F-ONLY-NEXT:    movq %rcx, %rdx
+; AVX512F-ONLY-NEXT:    shlq $32, %rdx
+; AVX512F-ONLY-NEXT:    orq %rdi, %rdx
+; AVX512F-ONLY-NEXT:    movq %rcx, %rdi
+; AVX512F-ONLY-NEXT:    shlq $33, %rdi
+; AVX512F-ONLY-NEXT:    orq %rdx, %rdi
+; AVX512F-ONLY-NEXT:    shlq $34, %rcx
+; AVX512F-ONLY-NEXT:    orq %rdi, %rcx
+; AVX512F-ONLY-NEXT:    kmovw %k3, %edx
+; AVX512F-ONLY-NEXT:    andl $1, %edx
+; AVX512F-ONLY-NEXT:    movq %rdx, %rdi
+; AVX512F-ONLY-NEXT:    shlq $35, %rdi
+; AVX512F-ONLY-NEXT:    orq %rcx, %rdi
+; AVX512F-ONLY-NEXT:    movq %rdx, %rcx
+; AVX512F-ONLY-NEXT:    shlq $36, %rcx
+; AVX512F-ONLY-NEXT:    orq %rdi, %rcx
+; AVX512F-ONLY-NEXT:    movq %rdx, %rdi
+; AVX512F-ONLY-NEXT:    shlq $37, %rdi
+; AVX512F-ONLY-NEXT:    orq %rcx, %rdi
+; AVX512F-ONLY-NEXT:    movq %rdx, %rcx
+; AVX512F-ONLY-NEXT:    shlq $38, %rcx
+; AVX512F-ONLY-NEXT:    orq %rdi, %rcx
+; AVX512F-ONLY-NEXT:    movq %rdx, %rdi
+; AVX512F-ONLY-NEXT:    shlq $39, %rdi
+; AVX512F-ONLY-NEXT:    orq %rcx, %rdi
+; AVX512F-ONLY-NEXT:    movq %rdx, %rcx
+; AVX512F-ONLY-NEXT:    shlq $40, %rcx
+; AVX512F-ONLY-NEXT:    orq %rdi, %rcx
+; AVX512F-ONLY-NEXT:    shlq $41, %rdx
+; AVX512F-ONLY-NEXT:    orq %rcx, %rdx
+; AVX512F-ONLY-NEXT:    kmovw %k2, %ecx
+; AVX512F-ONLY-NEXT:    andl $1, %ecx
+; AVX512F-ONLY-NEXT:    movq %rcx, %rdi
+; AVX512F-ONLY-NEXT:    shlq $42, %rdi
+; AVX512F-ONLY-NEXT:    orq %rdx, %rdi
+; AVX512F-ONLY-NEXT:    movq %rcx, %rdx
+; AVX512F-ONLY-NEXT:    shlq $43, %rdx
+; AVX512F-ONLY-NEXT:    orq %rdi, %rdx
+; AVX512F-ONLY-NEXT:    movq %rcx, %rdi
+; AVX512F-ONLY-NEXT:    shlq $44, %rdi
+; AVX512F-ONLY-NEXT:    orq %rdx, %rdi
+; AVX512F-ONLY-NEXT:    movq %rcx, %rdx
+; AVX512F-ONLY-NEXT:    shlq $45, %rdx
+; AVX512F-ONLY-NEXT:    orq %rdi, %rdx
+; AVX512F-ONLY-NEXT:    movq %rcx, %rdi
+; AVX512F-ONLY-NEXT:    shlq $46, %rdi
+; AVX512F-ONLY-NEXT:    orq %rdx, %rdi
+; AVX512F-ONLY-NEXT:    movq %rcx, %rdx
+; AVX512F-ONLY-NEXT:    shlq $47, %rdx
+; AVX512F-ONLY-NEXT:    orq %rdi, %rdx
+; AVX512F-ONLY-NEXT:    shlq $48, %rcx
+; AVX512F-ONLY-NEXT:    orq %rdx, %rcx
+; AVX512F-ONLY-NEXT:    kmovw %k1, %edx
+; AVX512F-ONLY-NEXT:    andl $1, %edx
+; AVX512F-ONLY-NEXT:    movq %rdx, %rdi
+; AVX512F-ONLY-NEXT:    shlq $49, %rdi
+; AVX512F-ONLY-NEXT:    orq %rcx, %rdi
+; AVX512F-ONLY-NEXT:    movq %rdx, %rcx
+; AVX512F-ONLY-NEXT:    shlq $50, %rcx
+; AVX512F-ONLY-NEXT:    orq %rdi, %rcx
+; AVX512F-ONLY-NEXT:    movq %rdx, %rdi
+; AVX512F-ONLY-NEXT:    shlq $51, %rdi
+; AVX512F-ONLY-NEXT:    orq %rcx, %rdi
+; AVX512F-ONLY-NEXT:    movq %rdx, %rcx
+; AVX512F-ONLY-NEXT:    shlq $52, %rcx
+; AVX512F-ONLY-NEXT:    orq %rdi, %rcx
+; AVX512F-ONLY-NEXT:    movq %rdx, %rdi
+; AVX512F-ONLY-NEXT:    shlq $53, %rdi
+; AVX512F-ONLY-NEXT:    orq %rcx, %rdi
+; AVX512F-ONLY-NEXT:    movq %rdx, %rcx
+; AVX512F-ONLY-NEXT:    shlq $54, %rcx
+; AVX512F-ONLY-NEXT:    orq %rdi, %rcx
+; AVX512F-ONLY-NEXT:    shlq $55, %rdx
+; AVX512F-ONLY-NEXT:    orq %rcx, %rdx
+; AVX512F-ONLY-NEXT:    kmovw %k0, %ecx
+; AVX512F-ONLY-NEXT:    andl $1, %ecx
+; AVX512F-ONLY-NEXT:    movq %rcx, %rdi
+; AVX512F-ONLY-NEXT:    shlq $56, %rdi
+; AVX512F-ONLY-NEXT:    orq %rdx, %rdi
+; AVX512F-ONLY-NEXT:    movq %rcx, %rdx
+; AVX512F-ONLY-NEXT:    shlq $57, %rdx
+; AVX512F-ONLY-NEXT:    orq %rdi, %rdx
+; AVX512F-ONLY-NEXT:    movq %rcx, %rdi
+; AVX512F-ONLY-NEXT:    shlq $58, %rdi
+; AVX512F-ONLY-NEXT:    orq %rdx, %rdi
+; AVX512F-ONLY-NEXT:    movq %rcx, %rdx
+; AVX512F-ONLY-NEXT:    shlq $59, %rdx
+; AVX512F-ONLY-NEXT:    orq %rdi, %rdx
+; AVX512F-ONLY-NEXT:    movq %rcx, %rdi
+; AVX512F-ONLY-NEXT:    shlq $60, %rdi
+; AVX512F-ONLY-NEXT:    orq %rdx, %rdi
+; AVX512F-ONLY-NEXT:    movq %rcx, %rdx
+; AVX512F-ONLY-NEXT:    shlq $61, %rdx
+; AVX512F-ONLY-NEXT:    orq %rdi, %rdx
+; AVX512F-ONLY-NEXT:    shlq $62, %rcx
+; AVX512F-ONLY-NEXT:    orq %rdx, %rcx
+; AVX512F-ONLY-NEXT:    shlq $63, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    orq %r8, %rax
+; AVX512F-ONLY-NEXT:    movq %rax, (%rsi)
+; AVX512F-ONLY-NEXT:    popq %rbx
+; AVX512F-ONLY-NEXT:    popq %r12
+; AVX512F-ONLY-NEXT:    popq %r14
+; AVX512F-ONLY-NEXT:    popq %r15
+; AVX512F-ONLY-NEXT:    popq %rbp
+; AVX512F-ONLY-NEXT:    retq
+;
+; AVX512DQ-LABEL: mask_replication_factor7_vf16:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    pushq %rbp
+; AVX512DQ-NEXT:    pushq %r15
+; AVX512DQ-NEXT:    pushq %r14
+; AVX512DQ-NEXT:    pushq %r12
+; AVX512DQ-NEXT:    pushq %rbx
+; AVX512DQ-NEXT:    kmovw (%rdi), %k4
+; AVX512DQ-NEXT:    kshiftrw $8, %k4, %k0
+; AVX512DQ-NEXT:    kshiftrw $7, %k4, %k1
+; AVX512DQ-NEXT:    kshiftrw $6, %k4, %k2
+; AVX512DQ-NEXT:    kshiftrw $5, %k4, %k3
+; AVX512DQ-NEXT:    kshiftrw $4, %k4, %k5
+; AVX512DQ-NEXT:    kmovw %k5, %ecx
+; AVX512DQ-NEXT:    kshiftrw $3, %k4, %k5
+; AVX512DQ-NEXT:    kmovw %k5, %edx
+; AVX512DQ-NEXT:    kshiftrw $2, %k4, %k5
+; AVX512DQ-NEXT:    kmovw %k5, %edi
+; AVX512DQ-NEXT:    kshiftrw $1, %k4, %k5
+; AVX512DQ-NEXT:    kmovw %k5, %r9d
+; AVX512DQ-NEXT:    kshiftrw $15, %k4, %k5
+; AVX512DQ-NEXT:    kmovw %k5, %r8d
+; AVX512DQ-NEXT:    kshiftrw $14, %k4, %k5
+; AVX512DQ-NEXT:    kshiftrw $13, %k4, %k6
+; AVX512DQ-NEXT:    kmovw %k6, %r10d
+; AVX512DQ-NEXT:    kshiftrw $12, %k4, %k6
+; AVX512DQ-NEXT:    kmovw %k6, %ebx
+; AVX512DQ-NEXT:    kshiftrw $11, %k4, %k6
+; AVX512DQ-NEXT:    kmovw %k6, %ebp
+; AVX512DQ-NEXT:    kshiftrw $10, %k4, %k6
+; AVX512DQ-NEXT:    kmovw %k6, %r14d
+; AVX512DQ-NEXT:    kshiftrw $9, %k4, %k6
+; AVX512DQ-NEXT:    kmovw %k6, %eax
+; AVX512DQ-NEXT:    movzbl %al, %eax
+; AVX512DQ-NEXT:    movl %eax, %r11d
+; AVX512DQ-NEXT:    andl $1, %r11d
+; AVX512DQ-NEXT:    leaq (%r11,%r11,2), %r15
+; AVX512DQ-NEXT:    leaq (%r15,%r11,4), %r15
+; AVX512DQ-NEXT:    leaq (%r15,%r11,8), %r15
+; AVX512DQ-NEXT:    movq %r11, %r12
+; AVX512DQ-NEXT:    shlq $4, %r12
+; AVX512DQ-NEXT:    orq %r15, %r12
+; AVX512DQ-NEXT:    shlq $5, %r11
+; AVX512DQ-NEXT:    orq %r12, %r11
+; AVX512DQ-NEXT:    movzbl %r14b, %r15d
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    movq %r15, %r14
+; AVX512DQ-NEXT:    shlq $6, %r14
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $7, %r12
+; AVX512DQ-NEXT:    orq %r14, %r12
+; AVX512DQ-NEXT:    movq %r15, %r14
+; AVX512DQ-NEXT:    shlq $8, %r14
+; AVX512DQ-NEXT:    orq %r12, %r14
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $9, %r12
+; AVX512DQ-NEXT:    orq %r14, %r12
+; AVX512DQ-NEXT:    movq %r15, %r14
+; AVX512DQ-NEXT:    shlq $10, %r14
+; AVX512DQ-NEXT:    orq %r12, %r14
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $11, %r12
+; AVX512DQ-NEXT:    orq %r14, %r12
+; AVX512DQ-NEXT:    shlq $12, %r15
+; AVX512DQ-NEXT:    orq %r12, %r15
+; AVX512DQ-NEXT:    movzbl %bpl, %r14d
+; AVX512DQ-NEXT:    andl $1, %r14d
+; AVX512DQ-NEXT:    movq %r14, %r12
+; AVX512DQ-NEXT:    shlq $13, %r12
+; AVX512DQ-NEXT:    orq %r15, %r12
+; AVX512DQ-NEXT:    movq %r14, %r15
+; AVX512DQ-NEXT:    shlq $14, %r15
+; AVX512DQ-NEXT:    orq %r12, %r15
+; AVX512DQ-NEXT:    movq %r14, %r12
+; AVX512DQ-NEXT:    shlq $15, %r12
+; AVX512DQ-NEXT:    orq %r15, %r12
+; AVX512DQ-NEXT:    movq %r14, %r15
+; AVX512DQ-NEXT:    shlq $16, %r15
+; AVX512DQ-NEXT:    orq %r12, %r15
+; AVX512DQ-NEXT:    movq %r14, %r12
+; AVX512DQ-NEXT:    shlq $17, %r12
+; AVX512DQ-NEXT:    orq %r15, %r12
+; AVX512DQ-NEXT:    movq %r14, %r15
+; AVX512DQ-NEXT:    shlq $18, %r15
+; AVX512DQ-NEXT:    orq %r12, %r15
+; AVX512DQ-NEXT:    shlq $19, %r14
+; AVX512DQ-NEXT:    orq %r15, %r14
+; AVX512DQ-NEXT:    movzbl %bl, %r15d
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    movq %r15, %rbx
+; AVX512DQ-NEXT:    shlq $20, %rbx
+; AVX512DQ-NEXT:    orq %r14, %rbx
+; AVX512DQ-NEXT:    movq %r15, %r14
+; AVX512DQ-NEXT:    shlq $21, %r14
+; AVX512DQ-NEXT:    orq %rbx, %r14
+; AVX512DQ-NEXT:    movq %r15, %rbx
+; AVX512DQ-NEXT:    shlq $22, %rbx
+; AVX512DQ-NEXT:    orq %r14, %rbx
+; AVX512DQ-NEXT:    movq %r15, %r14
+; AVX512DQ-NEXT:    shlq $23, %r14
+; AVX512DQ-NEXT:    orq %rbx, %r14
+; AVX512DQ-NEXT:    movq %r15, %rbx
+; AVX512DQ-NEXT:    shlq $24, %rbx
+; AVX512DQ-NEXT:    orq %r14, %rbx
+; AVX512DQ-NEXT:    movq %r15, %r14
+; AVX512DQ-NEXT:    shlq $25, %r14
+; AVX512DQ-NEXT:    orq %rbx, %r14
+; AVX512DQ-NEXT:    shlq $26, %r15
+; AVX512DQ-NEXT:    orq %r14, %r15
+; AVX512DQ-NEXT:    movzbl %r10b, %ebx
+; AVX512DQ-NEXT:    andl $1, %ebx
+; AVX512DQ-NEXT:    movq %rbx, %r14
+; AVX512DQ-NEXT:    shlq $27, %r14
+; AVX512DQ-NEXT:    orq %r15, %r14
+; AVX512DQ-NEXT:    movq %rbx, %r15
+; AVX512DQ-NEXT:    shlq $28, %r15
+; AVX512DQ-NEXT:    orq %r14, %r15
+; AVX512DQ-NEXT:    movq %rbx, %r14
+; AVX512DQ-NEXT:    shlq $29, %r14
+; AVX512DQ-NEXT:    orq %r15, %r14
+; AVX512DQ-NEXT:    movq %rbx, %r15
+; AVX512DQ-NEXT:    shlq $30, %r15
+; AVX512DQ-NEXT:    orq %r14, %r15
+; AVX512DQ-NEXT:    shlq $31, %rbx
+; AVX512DQ-NEXT:    orq %r15, %rbx
+; AVX512DQ-NEXT:    orq %r11, %rbx
+; AVX512DQ-NEXT:    movl %ebx, 8(%rsi)
+; AVX512DQ-NEXT:    andl $1, %r10d
+; AVX512DQ-NEXT:    movq %r10, %r11
+; AVX512DQ-NEXT:    shlq $32, %r11
+; AVX512DQ-NEXT:    shlq $33, %r10
+; AVX512DQ-NEXT:    orq %r11, %r10
+; AVX512DQ-NEXT:    kmovw %k5, %r11d
+; AVX512DQ-NEXT:    andl $1, %r11d
+; AVX512DQ-NEXT:    movq %r11, %r14
+; AVX512DQ-NEXT:    shlq $34, %r14
+; AVX512DQ-NEXT:    orq %r10, %r14
+; AVX512DQ-NEXT:    movq %r11, %r10
+; AVX512DQ-NEXT:    shlq $35, %r10
+; AVX512DQ-NEXT:    orq %r14, %r10
+; AVX512DQ-NEXT:    movq %r11, %r14
+; AVX512DQ-NEXT:    shlq $36, %r14
+; AVX512DQ-NEXT:    orq %r10, %r14
+; AVX512DQ-NEXT:    movq %r11, %r10
+; AVX512DQ-NEXT:    shlq $37, %r10
+; AVX512DQ-NEXT:    orq %r14, %r10
+; AVX512DQ-NEXT:    movq %r11, %r14
+; AVX512DQ-NEXT:    shlq $38, %r14
+; AVX512DQ-NEXT:    orq %r10, %r14
+; AVX512DQ-NEXT:    movq %r11, %r10
+; AVX512DQ-NEXT:    shlq $39, %r10
+; AVX512DQ-NEXT:    orq %r14, %r10
+; AVX512DQ-NEXT:    shlq $40, %r11
+; AVX512DQ-NEXT:    orq %r10, %r11
+; AVX512DQ-NEXT:    movzbl %r8b, %r10d
+; AVX512DQ-NEXT:    # kill: def $r8d killed $r8d def $r8
+; AVX512DQ-NEXT:    andl $1, %r8d
+; AVX512DQ-NEXT:    movq %r8, %r14
+; AVX512DQ-NEXT:    shlq $41, %r14
+; AVX512DQ-NEXT:    orq %r11, %r14
+; AVX512DQ-NEXT:    movq %r8, %r11
+; AVX512DQ-NEXT:    shlq $42, %r11
+; AVX512DQ-NEXT:    orq %r14, %r11
+; AVX512DQ-NEXT:    movq %r8, %r14
+; AVX512DQ-NEXT:    shlq $43, %r14
+; AVX512DQ-NEXT:    orq %r11, %r14
+; AVX512DQ-NEXT:    movq %r8, %r11
+; AVX512DQ-NEXT:    shlq $44, %r11
+; AVX512DQ-NEXT:    orq %r14, %r11
+; AVX512DQ-NEXT:    movq %r8, %r14
+; AVX512DQ-NEXT:    shlq $45, %r14
+; AVX512DQ-NEXT:    orq %r11, %r14
+; AVX512DQ-NEXT:    shlq $46, %r8
+; AVX512DQ-NEXT:    orq %r14, %r8
+; AVX512DQ-NEXT:    shlq $47, %r10
+; AVX512DQ-NEXT:    orq %r8, %r10
+; AVX512DQ-NEXT:    orq %rbx, %r10
+; AVX512DQ-NEXT:    shrq $32, %r10
+; AVX512DQ-NEXT:    movw %r10w, 12(%rsi)
+; AVX512DQ-NEXT:    kmovw %k4, %r8d
+; AVX512DQ-NEXT:    movzbl %r8b, %r10d
+; AVX512DQ-NEXT:    andl $1, %r10d
+; AVX512DQ-NEXT:    leaq (%r10,%r10,2), %r8
+; AVX512DQ-NEXT:    leaq (%r8,%r10,4), %r8
+; AVX512DQ-NEXT:    leaq (%r8,%r10,8), %r8
+; AVX512DQ-NEXT:    movq %r10, %r11
+; AVX512DQ-NEXT:    shlq $4, %r11
+; AVX512DQ-NEXT:    orq %r8, %r11
+; AVX512DQ-NEXT:    movq %r10, %r8
+; AVX512DQ-NEXT:    shlq $5, %r8
+; AVX512DQ-NEXT:    orq %r11, %r8
+; AVX512DQ-NEXT:    shlq $6, %r10
+; AVX512DQ-NEXT:    movzbl %r9b, %r9d
+; AVX512DQ-NEXT:    andl $1, %r9d
+; AVX512DQ-NEXT:    movq %r9, %r11
+; AVX512DQ-NEXT:    shlq $7, %r11
+; AVX512DQ-NEXT:    orq %r10, %r11
+; AVX512DQ-NEXT:    movq %r9, %r10
+; AVX512DQ-NEXT:    shlq $8, %r10
+; AVX512DQ-NEXT:    orq %r11, %r10
+; AVX512DQ-NEXT:    movq %r9, %r11
+; AVX512DQ-NEXT:    shlq $9, %r11
+; AVX512DQ-NEXT:    orq %r10, %r11
+; AVX512DQ-NEXT:    movq %r9, %r10
+; AVX512DQ-NEXT:    shlq $10, %r10
+; AVX512DQ-NEXT:    orq %r11, %r10
+; AVX512DQ-NEXT:    movq %r9, %r11
+; AVX512DQ-NEXT:    shlq $11, %r11
+; AVX512DQ-NEXT:    orq %r10, %r11
+; AVX512DQ-NEXT:    movq %r9, %r10
+; AVX512DQ-NEXT:    shlq $12, %r10
+; AVX512DQ-NEXT:    orq %r11, %r10
+; AVX512DQ-NEXT:    shlq $13, %r9
+; AVX512DQ-NEXT:    orq %r10, %r9
+; AVX512DQ-NEXT:    movzbl %dil, %edi
+; AVX512DQ-NEXT:    andl $1, %edi
+; AVX512DQ-NEXT:    movq %rdi, %r10
+; AVX512DQ-NEXT:    shlq $14, %r10
+; AVX512DQ-NEXT:    orq %r9, %r10
+; AVX512DQ-NEXT:    movq %rdi, %r9
+; AVX512DQ-NEXT:    shlq $15, %r9
+; AVX512DQ-NEXT:    orq %r10, %r9
+; AVX512DQ-NEXT:    movq %rdi, %r10
+; AVX512DQ-NEXT:    shlq $16, %r10
+; AVX512DQ-NEXT:    orq %r9, %r10
+; AVX512DQ-NEXT:    movq %rdi, %r9
+; AVX512DQ-NEXT:    shlq $17, %r9
+; AVX512DQ-NEXT:    orq %r10, %r9
+; AVX512DQ-NEXT:    movq %rdi, %r10
+; AVX512DQ-NEXT:    shlq $18, %r10
+; AVX512DQ-NEXT:    orq %r9, %r10
+; AVX512DQ-NEXT:    movq %rdi, %r9
+; AVX512DQ-NEXT:    shlq $19, %r9
+; AVX512DQ-NEXT:    orq %r10, %r9
+; AVX512DQ-NEXT:    shlq $20, %rdi
+; AVX512DQ-NEXT:    orq %r9, %rdi
+; AVX512DQ-NEXT:    movzbl %dl, %edx
+; AVX512DQ-NEXT:    andl $1, %edx
+; AVX512DQ-NEXT:    movq %rdx, %r9
+; AVX512DQ-NEXT:    shlq $21, %r9
+; AVX512DQ-NEXT:    orq %rdi, %r9
+; AVX512DQ-NEXT:    movq %rdx, %rdi
+; AVX512DQ-NEXT:    shlq $22, %rdi
+; AVX512DQ-NEXT:    orq %r9, %rdi
+; AVX512DQ-NEXT:    movq %rdx, %r9
+; AVX512DQ-NEXT:    shlq $23, %r9
+; AVX512DQ-NEXT:    orq %rdi, %r9
+; AVX512DQ-NEXT:    movq %rdx, %rdi
+; AVX512DQ-NEXT:    shlq $24, %rdi
+; AVX512DQ-NEXT:    orq %r9, %rdi
+; AVX512DQ-NEXT:    movq %rdx, %r9
+; AVX512DQ-NEXT:    shlq $25, %r9
+; AVX512DQ-NEXT:    orq %rdi, %r9
+; AVX512DQ-NEXT:    movq %rdx, %rdi
+; AVX512DQ-NEXT:    shlq $26, %rdi
+; AVX512DQ-NEXT:    orq %r9, %rdi
+; AVX512DQ-NEXT:    shlq $27, %rdx
+; AVX512DQ-NEXT:    orq %rdi, %rdx
+; AVX512DQ-NEXT:    movzbl %cl, %edi
+; AVX512DQ-NEXT:    andl $1, %edi
+; AVX512DQ-NEXT:    movq %rdi, %r9
+; AVX512DQ-NEXT:    shlq $28, %r9
+; AVX512DQ-NEXT:    orq %rdx, %r9
+; AVX512DQ-NEXT:    movq %rdi, %rdx
+; AVX512DQ-NEXT:    shlq $29, %rdx
+; AVX512DQ-NEXT:    orq %r9, %rdx
+; AVX512DQ-NEXT:    movq %rdi, %r9
+; AVX512DQ-NEXT:    shlq $30, %r9
+; AVX512DQ-NEXT:    orq %rdx, %r9
+; AVX512DQ-NEXT:    shlq $31, %rdi
+; AVX512DQ-NEXT:    orq %r9, %rdi
+; AVX512DQ-NEXT:    andl $1, %ecx
+; AVX512DQ-NEXT:    movq %rcx, %rdx
+; AVX512DQ-NEXT:    shlq $32, %rdx
+; AVX512DQ-NEXT:    orq %rdi, %rdx
+; AVX512DQ-NEXT:    movq %rcx, %rdi
+; AVX512DQ-NEXT:    shlq $33, %rdi
+; AVX512DQ-NEXT:    orq %rdx, %rdi
+; AVX512DQ-NEXT:    shlq $34, %rcx
+; AVX512DQ-NEXT:    orq %rdi, %rcx
+; AVX512DQ-NEXT:    kmovw %k3, %edx
+; AVX512DQ-NEXT:    andl $1, %edx
+; AVX512DQ-NEXT:    movq %rdx, %rdi
+; AVX512DQ-NEXT:    shlq $35, %rdi
+; AVX512DQ-NEXT:    orq %rcx, %rdi
+; AVX512DQ-NEXT:    movq %rdx, %rcx
+; AVX512DQ-NEXT:    shlq $36, %rcx
+; AVX512DQ-NEXT:    orq %rdi, %rcx
+; AVX512DQ-NEXT:    movq %rdx, %rdi
+; AVX512DQ-NEXT:    shlq $37, %rdi
+; AVX512DQ-NEXT:    orq %rcx, %rdi
+; AVX512DQ-NEXT:    movq %rdx, %rcx
+; AVX512DQ-NEXT:    shlq $38, %rcx
+; AVX512DQ-NEXT:    orq %rdi, %rcx
+; AVX512DQ-NEXT:    movq %rdx, %rdi
+; AVX512DQ-NEXT:    shlq $39, %rdi
+; AVX512DQ-NEXT:    orq %rcx, %rdi
+; AVX512DQ-NEXT:    movq %rdx, %rcx
+; AVX512DQ-NEXT:    shlq $40, %rcx
+; AVX512DQ-NEXT:    orq %rdi, %rcx
+; AVX512DQ-NEXT:    shlq $41, %rdx
+; AVX512DQ-NEXT:    orq %rcx, %rdx
+; AVX512DQ-NEXT:    kmovw %k2, %ecx
+; AVX512DQ-NEXT:    andl $1, %ecx
+; AVX512DQ-NEXT:    movq %rcx, %rdi
+; AVX512DQ-NEXT:    shlq $42, %rdi
+; AVX512DQ-NEXT:    orq %rdx, %rdi
+; AVX512DQ-NEXT:    movq %rcx, %rdx
+; AVX512DQ-NEXT:    shlq $43, %rdx
+; AVX512DQ-NEXT:    orq %rdi, %rdx
+; AVX512DQ-NEXT:    movq %rcx, %rdi
+; AVX512DQ-NEXT:    shlq $44, %rdi
+; AVX512DQ-NEXT:    orq %rdx, %rdi
+; AVX512DQ-NEXT:    movq %rcx, %rdx
+; AVX512DQ-NEXT:    shlq $45, %rdx
+; AVX512DQ-NEXT:    orq %rdi, %rdx
+; AVX512DQ-NEXT:    movq %rcx, %rdi
+; AVX512DQ-NEXT:    shlq $46, %rdi
+; AVX512DQ-NEXT:    orq %rdx, %rdi
+; AVX512DQ-NEXT:    movq %rcx, %rdx
+; AVX512DQ-NEXT:    shlq $47, %rdx
+; AVX512DQ-NEXT:    orq %rdi, %rdx
+; AVX512DQ-NEXT:    shlq $48, %rcx
+; AVX512DQ-NEXT:    orq %rdx, %rcx
+; AVX512DQ-NEXT:    kmovw %k1, %edx
+; AVX512DQ-NEXT:    andl $1, %edx
+; AVX512DQ-NEXT:    movq %rdx, %rdi
+; AVX512DQ-NEXT:    shlq $49, %rdi
+; AVX512DQ-NEXT:    orq %rcx, %rdi
+; AVX512DQ-NEXT:    movq %rdx, %rcx
+; AVX512DQ-NEXT:    shlq $50, %rcx
+; AVX512DQ-NEXT:    orq %rdi, %rcx
+; AVX512DQ-NEXT:    movq %rdx, %rdi
+; AVX512DQ-NEXT:    shlq $51, %rdi
+; AVX512DQ-NEXT:    orq %rcx, %rdi
+; AVX512DQ-NEXT:    movq %rdx, %rcx
+; AVX512DQ-NEXT:    shlq $52, %rcx
+; AVX512DQ-NEXT:    orq %rdi, %rcx
+; AVX512DQ-NEXT:    movq %rdx, %rdi
+; AVX512DQ-NEXT:    shlq $53, %rdi
+; AVX512DQ-NEXT:    orq %rcx, %rdi
+; AVX512DQ-NEXT:    movq %rdx, %rcx
+; AVX512DQ-NEXT:    shlq $54, %rcx
+; AVX512DQ-NEXT:    orq %rdi, %rcx
+; AVX512DQ-NEXT:    shlq $55, %rdx
+; AVX512DQ-NEXT:    orq %rcx, %rdx
+; AVX512DQ-NEXT:    kmovw %k0, %ecx
+; AVX512DQ-NEXT:    andl $1, %ecx
+; AVX512DQ-NEXT:    movq %rcx, %rdi
+; AVX512DQ-NEXT:    shlq $56, %rdi
+; AVX512DQ-NEXT:    orq %rdx, %rdi
+; AVX512DQ-NEXT:    movq %rcx, %rdx
+; AVX512DQ-NEXT:    shlq $57, %rdx
+; AVX512DQ-NEXT:    orq %rdi, %rdx
+; AVX512DQ-NEXT:    movq %rcx, %rdi
+; AVX512DQ-NEXT:    shlq $58, %rdi
+; AVX512DQ-NEXT:    orq %rdx, %rdi
+; AVX512DQ-NEXT:    movq %rcx, %rdx
+; AVX512DQ-NEXT:    shlq $59, %rdx
+; AVX512DQ-NEXT:    orq %rdi, %rdx
+; AVX512DQ-NEXT:    movq %rcx, %rdi
+; AVX512DQ-NEXT:    shlq $60, %rdi
+; AVX512DQ-NEXT:    orq %rdx, %rdi
+; AVX512DQ-NEXT:    movq %rcx, %rdx
+; AVX512DQ-NEXT:    shlq $61, %rdx
+; AVX512DQ-NEXT:    orq %rdi, %rdx
+; AVX512DQ-NEXT:    shlq $62, %rcx
+; AVX512DQ-NEXT:    orq %rdx, %rcx
+; AVX512DQ-NEXT:    shlq $63, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    orq %r8, %rax
+; AVX512DQ-NEXT:    movq %rax, (%rsi)
+; AVX512DQ-NEXT:    popq %rbx
+; AVX512DQ-NEXT:    popq %r12
+; AVX512DQ-NEXT:    popq %r14
+; AVX512DQ-NEXT:    popq %r15
+; AVX512DQ-NEXT:    popq %rbp
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: mask_replication_factor7_vf16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    pushq %rbp
+; AVX512BW-NEXT:    pushq %r15
+; AVX512BW-NEXT:    pushq %r14
+; AVX512BW-NEXT:    pushq %r12
+; AVX512BW-NEXT:    pushq %rbx
+; AVX512BW-NEXT:    kmovw (%rdi), %k4
+; AVX512BW-NEXT:    kshiftrw $8, %k4, %k0
+; AVX512BW-NEXT:    kshiftrw $7, %k4, %k1
+; AVX512BW-NEXT:    kshiftrw $6, %k4, %k2
+; AVX512BW-NEXT:    kshiftrw $5, %k4, %k3
+; AVX512BW-NEXT:    kshiftrw $4, %k4, %k5
+; AVX512BW-NEXT:    kmovd %k5, %ecx
+; AVX512BW-NEXT:    kshiftrw $3, %k4, %k5
+; AVX512BW-NEXT:    kmovd %k5, %edx
+; AVX512BW-NEXT:    kshiftrw $2, %k4, %k5
+; AVX512BW-NEXT:    kmovd %k5, %edi
+; AVX512BW-NEXT:    kshiftrw $1, %k4, %k5
+; AVX512BW-NEXT:    kmovd %k5, %r9d
+; AVX512BW-NEXT:    kshiftrw $15, %k4, %k5
+; AVX512BW-NEXT:    kmovd %k5, %r8d
+; AVX512BW-NEXT:    kshiftrw $14, %k4, %k5
+; AVX512BW-NEXT:    kshiftrw $13, %k4, %k6
+; AVX512BW-NEXT:    kmovd %k6, %r10d
+; AVX512BW-NEXT:    kshiftrw $12, %k4, %k6
+; AVX512BW-NEXT:    kmovd %k6, %ebx
+; AVX512BW-NEXT:    kshiftrw $11, %k4, %k6
+; AVX512BW-NEXT:    kmovd %k6, %ebp
+; AVX512BW-NEXT:    kshiftrw $10, %k4, %k6
+; AVX512BW-NEXT:    kmovd %k6, %r14d
+; AVX512BW-NEXT:    kshiftrw $9, %k4, %k6
+; AVX512BW-NEXT:    kmovd %k6, %eax
+; AVX512BW-NEXT:    movzbl %al, %eax
+; AVX512BW-NEXT:    movl %eax, %r11d
+; AVX512BW-NEXT:    andl $1, %r11d
+; AVX512BW-NEXT:    leaq (%r11,%r11,2), %r15
+; AVX512BW-NEXT:    leaq (%r15,%r11,4), %r15
+; AVX512BW-NEXT:    leaq (%r15,%r11,8), %r15
+; AVX512BW-NEXT:    movq %r11, %r12
+; AVX512BW-NEXT:    shlq $4, %r12
+; AVX512BW-NEXT:    orq %r15, %r12
+; AVX512BW-NEXT:    shlq $5, %r11
+; AVX512BW-NEXT:    orq %r12, %r11
+; AVX512BW-NEXT:    movzbl %r14b, %r15d
+; AVX512BW-NEXT:    andl $1, %r15d
+; AVX512BW-NEXT:    movq %r15, %r14
+; AVX512BW-NEXT:    shlq $6, %r14
+; AVX512BW-NEXT:    movq %r15, %r12
+; AVX512BW-NEXT:    shlq $7, %r12
+; AVX512BW-NEXT:    orq %r14, %r12
+; AVX512BW-NEXT:    movq %r15, %r14
+; AVX512BW-NEXT:    shlq $8, %r14
+; AVX512BW-NEXT:    orq %r12, %r14
+; AVX512BW-NEXT:    movq %r15, %r12
+; AVX512BW-NEXT:    shlq $9, %r12
+; AVX512BW-NEXT:    orq %r14, %r12
+; AVX512BW-NEXT:    movq %r15, %r14
+; AVX512BW-NEXT:    shlq $10, %r14
+; AVX512BW-NEXT:    orq %r12, %r14
+; AVX512BW-NEXT:    movq %r15, %r12
+; AVX512BW-NEXT:    shlq $11, %r12
+; AVX512BW-NEXT:    orq %r14, %r12
+; AVX512BW-NEXT:    shlq $12, %r15
+; AVX512BW-NEXT:    orq %r12, %r15
+; AVX512BW-NEXT:    movzbl %bpl, %r14d
+; AVX512BW-NEXT:    andl $1, %r14d
+; AVX512BW-NEXT:    movq %r14, %r12
+; AVX512BW-NEXT:    shlq $13, %r12
+; AVX512BW-NEXT:    orq %r15, %r12
+; AVX512BW-NEXT:    movq %r14, %r15
+; AVX512BW-NEXT:    shlq $14, %r15
+; AVX512BW-NEXT:    orq %r12, %r15
+; AVX512BW-NEXT:    movq %r14, %r12
+; AVX512BW-NEXT:    shlq $15, %r12
+; AVX512BW-NEXT:    orq %r15, %r12
+; AVX512BW-NEXT:    movq %r14, %r15
+; AVX512BW-NEXT:    shlq $16, %r15
+; AVX512BW-NEXT:    orq %r12, %r15
+; AVX512BW-NEXT:    movq %r14, %r12
+; AVX512BW-NEXT:    shlq $17, %r12
+; AVX512BW-NEXT:    orq %r15, %r12
+; AVX512BW-NEXT:    movq %r14, %r15
+; AVX512BW-NEXT:    shlq $18, %r15
+; AVX512BW-NEXT:    orq %r12, %r15
+; AVX512BW-NEXT:    shlq $19, %r14
+; AVX512BW-NEXT:    orq %r15, %r14
+; AVX512BW-NEXT:    movzbl %bl, %r15d
+; AVX512BW-NEXT:    andl $1, %r15d
+; AVX512BW-NEXT:    movq %r15, %rbx
+; AVX512BW-NEXT:    shlq $20, %rbx
+; AVX512BW-NEXT:    orq %r14, %rbx
+; AVX512BW-NEXT:    movq %r15, %r14
+; AVX512BW-NEXT:    shlq $21, %r14
+; AVX512BW-NEXT:    orq %rbx, %r14
+; AVX512BW-NEXT:    movq %r15, %rbx
+; AVX512BW-NEXT:    shlq $22, %rbx
+; AVX512BW-NEXT:    orq %r14, %rbx
+; AVX512BW-NEXT:    movq %r15, %r14
+; AVX512BW-NEXT:    shlq $23, %r14
+; AVX512BW-NEXT:    orq %rbx, %r14
+; AVX512BW-NEXT:    movq %r15, %rbx
+; AVX512BW-NEXT:    shlq $24, %rbx
+; AVX512BW-NEXT:    orq %r14, %rbx
+; AVX512BW-NEXT:    movq %r15, %r14
+; AVX512BW-NEXT:    shlq $25, %r14
+; AVX512BW-NEXT:    orq %rbx, %r14
+; AVX512BW-NEXT:    shlq $26, %r15
+; AVX512BW-NEXT:    orq %r14, %r15
+; AVX512BW-NEXT:    movzbl %r10b, %ebx
+; AVX512BW-NEXT:    andl $1, %ebx
+; AVX512BW-NEXT:    movq %rbx, %r14
+; AVX512BW-NEXT:    shlq $27, %r14
+; AVX512BW-NEXT:    orq %r15, %r14
+; AVX512BW-NEXT:    movq %rbx, %r15
+; AVX512BW-NEXT:    shlq $28, %r15
+; AVX512BW-NEXT:    orq %r14, %r15
+; AVX512BW-NEXT:    movq %rbx, %r14
+; AVX512BW-NEXT:    shlq $29, %r14
+; AVX512BW-NEXT:    orq %r15, %r14
+; AVX512BW-NEXT:    movq %rbx, %r15
+; AVX512BW-NEXT:    shlq $30, %r15
+; AVX512BW-NEXT:    orq %r14, %r15
+; AVX512BW-NEXT:    shlq $31, %rbx
+; AVX512BW-NEXT:    orq %r15, %rbx
+; AVX512BW-NEXT:    orq %r11, %rbx
+; AVX512BW-NEXT:    movl %ebx, 8(%rsi)
+; AVX512BW-NEXT:    andl $1, %r10d
+; AVX512BW-NEXT:    movq %r10, %r11
+; AVX512BW-NEXT:    shlq $32, %r11
+; AVX512BW-NEXT:    shlq $33, %r10
+; AVX512BW-NEXT:    orq %r11, %r10
+; AVX512BW-NEXT:    kmovd %k5, %r11d
+; AVX512BW-NEXT:    andl $1, %r11d
+; AVX512BW-NEXT:    movq %r11, %r14
+; AVX512BW-NEXT:    shlq $34, %r14
+; AVX512BW-NEXT:    orq %r10, %r14
+; AVX512BW-NEXT:    movq %r11, %r10
+; AVX512BW-NEXT:    shlq $35, %r10
+; AVX512BW-NEXT:    orq %r14, %r10
+; AVX512BW-NEXT:    movq %r11, %r14
+; AVX512BW-NEXT:    shlq $36, %r14
+; AVX512BW-NEXT:    orq %r10, %r14
+; AVX512BW-NEXT:    movq %r11, %r10
+; AVX512BW-NEXT:    shlq $37, %r10
+; AVX512BW-NEXT:    orq %r14, %r10
+; AVX512BW-NEXT:    movq %r11, %r14
+; AVX512BW-NEXT:    shlq $38, %r14
+; AVX512BW-NEXT:    orq %r10, %r14
+; AVX512BW-NEXT:    movq %r11, %r10
+; AVX512BW-NEXT:    shlq $39, %r10
+; AVX512BW-NEXT:    orq %r14, %r10
+; AVX512BW-NEXT:    shlq $40, %r11
+; AVX512BW-NEXT:    orq %r10, %r11
+; AVX512BW-NEXT:    movzbl %r8b, %r10d
+; AVX512BW-NEXT:    # kill: def $r8d killed $r8d def $r8
+; AVX512BW-NEXT:    andl $1, %r8d
+; AVX512BW-NEXT:    movq %r8, %r14
+; AVX512BW-NEXT:    shlq $41, %r14
+; AVX512BW-NEXT:    orq %r11, %r14
+; AVX512BW-NEXT:    movq %r8, %r11
+; AVX512BW-NEXT:    shlq $42, %r11
+; AVX512BW-NEXT:    orq %r14, %r11
+; AVX512BW-NEXT:    movq %r8, %r14
+; AVX512BW-NEXT:    shlq $43, %r14
+; AVX512BW-NEXT:    orq %r11, %r14
+; AVX512BW-NEXT:    movq %r8, %r11
+; AVX512BW-NEXT:    shlq $44, %r11
+; AVX512BW-NEXT:    orq %r14, %r11
+; AVX512BW-NEXT:    movq %r8, %r14
+; AVX512BW-NEXT:    shlq $45, %r14
+; AVX512BW-NEXT:    orq %r11, %r14
+; AVX512BW-NEXT:    shlq $46, %r8
+; AVX512BW-NEXT:    orq %r14, %r8
+; AVX512BW-NEXT:    shlq $47, %r10
+; AVX512BW-NEXT:    orq %r8, %r10
+; AVX512BW-NEXT:    orq %rbx, %r10
+; AVX512BW-NEXT:    shrq $32, %r10
+; AVX512BW-NEXT:    movw %r10w, 12(%rsi)
+; AVX512BW-NEXT:    kmovd %k4, %r8d
+; AVX512BW-NEXT:    movzbl %r8b, %r10d
+; AVX512BW-NEXT:    andl $1, %r10d
+; AVX512BW-NEXT:    leaq (%r10,%r10,2), %r8
+; AVX512BW-NEXT:    leaq (%r8,%r10,4), %r8
+; AVX512BW-NEXT:    leaq (%r8,%r10,8), %r8
+; AVX512BW-NEXT:    movq %r10, %r11
+; AVX512BW-NEXT:    shlq $4, %r11
+; AVX512BW-NEXT:    orq %r8, %r11
+; AVX512BW-NEXT:    movq %r10, %r8
+; AVX512BW-NEXT:    shlq $5, %r8
+; AVX512BW-NEXT:    orq %r11, %r8
+; AVX512BW-NEXT:    shlq $6, %r10
+; AVX512BW-NEXT:    movzbl %r9b, %r9d
+; AVX512BW-NEXT:    andl $1, %r9d
+; AVX512BW-NEXT:    movq %r9, %r11
+; AVX512BW-NEXT:    shlq $7, %r11
+; AVX512BW-NEXT:    orq %r10, %r11
+; AVX512BW-NEXT:    movq %r9, %r10
+; AVX512BW-NEXT:    shlq $8, %r10
+; AVX512BW-NEXT:    orq %r11, %r10
+; AVX512BW-NEXT:    movq %r9, %r11
+; AVX512BW-NEXT:    shlq $9, %r11
+; AVX512BW-NEXT:    orq %r10, %r11
+; AVX512BW-NEXT:    movq %r9, %r10
+; AVX512BW-NEXT:    shlq $10, %r10
+; AVX512BW-NEXT:    orq %r11, %r10
+; AVX512BW-NEXT:    movq %r9, %r11
+; AVX512BW-NEXT:    shlq $11, %r11
+; AVX512BW-NEXT:    orq %r10, %r11
+; AVX512BW-NEXT:    movq %r9, %r10
+; AVX512BW-NEXT:    shlq $12, %r10
+; AVX512BW-NEXT:    orq %r11, %r10
+; AVX512BW-NEXT:    shlq $13, %r9
+; AVX512BW-NEXT:    orq %r10, %r9
+; AVX512BW-NEXT:    movzbl %dil, %edi
+; AVX512BW-NEXT:    andl $1, %edi
+; AVX512BW-NEXT:    movq %rdi, %r10
+; AVX512BW-NEXT:    shlq $14, %r10
+; AVX512BW-NEXT:    orq %r9, %r10
+; AVX512BW-NEXT:    movq %rdi, %r9
+; AVX512BW-NEXT:    shlq $15, %r9
+; AVX512BW-NEXT:    orq %r10, %r9
+; AVX512BW-NEXT:    movq %rdi, %r10
+; AVX512BW-NEXT:    shlq $16, %r10
+; AVX512BW-NEXT:    orq %r9, %r10
+; AVX512BW-NEXT:    movq %rdi, %r9
+; AVX512BW-NEXT:    shlq $17, %r9
+; AVX512BW-NEXT:    orq %r10, %r9
+; AVX512BW-NEXT:    movq %rdi, %r10
+; AVX512BW-NEXT:    shlq $18, %r10
+; AVX512BW-NEXT:    orq %r9, %r10
+; AVX512BW-NEXT:    movq %rdi, %r9
+; AVX512BW-NEXT:    shlq $19, %r9
+; AVX512BW-NEXT:    orq %r10, %r9
+; AVX512BW-NEXT:    shlq $20, %rdi
+; AVX512BW-NEXT:    orq %r9, %rdi
+; AVX512BW-NEXT:    movzbl %dl, %edx
+; AVX512BW-NEXT:    andl $1, %edx
+; AVX512BW-NEXT:    movq %rdx, %r9
+; AVX512BW-NEXT:    shlq $21, %r9
+; AVX512BW-NEXT:    orq %rdi, %r9
+; AVX512BW-NEXT:    movq %rdx, %rdi
+; AVX512BW-NEXT:    shlq $22, %rdi
+; AVX512BW-NEXT:    orq %r9, %rdi
+; AVX512BW-NEXT:    movq %rdx, %r9
+; AVX512BW-NEXT:    shlq $23, %r9
+; AVX512BW-NEXT:    orq %rdi, %r9
+; AVX512BW-NEXT:    movq %rdx, %rdi
+; AVX512BW-NEXT:    shlq $24, %rdi
+; AVX512BW-NEXT:    orq %r9, %rdi
+; AVX512BW-NEXT:    movq %rdx, %r9
+; AVX512BW-NEXT:    shlq $25, %r9
+; AVX512BW-NEXT:    orq %rdi, %r9
+; AVX512BW-NEXT:    movq %rdx, %rdi
+; AVX512BW-NEXT:    shlq $26, %rdi
+; AVX512BW-NEXT:    orq %r9, %rdi
+; AVX512BW-NEXT:    shlq $27, %rdx
+; AVX512BW-NEXT:    orq %rdi, %rdx
+; AVX512BW-NEXT:    movzbl %cl, %edi
+; AVX512BW-NEXT:    andl $1, %edi
+; AVX512BW-NEXT:    movq %rdi, %r9
+; AVX512BW-NEXT:    shlq $28, %r9
+; AVX512BW-NEXT:    orq %rdx, %r9
+; AVX512BW-NEXT:    movq %rdi, %rdx
+; AVX512BW-NEXT:    shlq $29, %rdx
+; AVX512BW-NEXT:    orq %r9, %rdx
+; AVX512BW-NEXT:    movq %rdi, %r9
+; AVX512BW-NEXT:    shlq $30, %r9
+; AVX512BW-NEXT:    orq %rdx, %r9
+; AVX512BW-NEXT:    shlq $31, %rdi
+; AVX512BW-NEXT:    orq %r9, %rdi
+; AVX512BW-NEXT:    andl $1, %ecx
+; AVX512BW-NEXT:    movq %rcx, %rdx
+; AVX512BW-NEXT:    shlq $32, %rdx
+; AVX512BW-NEXT:    orq %rdi, %rdx
+; AVX512BW-NEXT:    movq %rcx, %rdi
+; AVX512BW-NEXT:    shlq $33, %rdi
+; AVX512BW-NEXT:    orq %rdx, %rdi
+; AVX512BW-NEXT:    shlq $34, %rcx
+; AVX512BW-NEXT:    orq %rdi, %rcx
+; AVX512BW-NEXT:    kmovd %k3, %edx
+; AVX512BW-NEXT:    andl $1, %edx
+; AVX512BW-NEXT:    movq %rdx, %rdi
+; AVX512BW-NEXT:    shlq $35, %rdi
+; AVX512BW-NEXT:    orq %rcx, %rdi
+; AVX512BW-NEXT:    movq %rdx, %rcx
+; AVX512BW-NEXT:    shlq $36, %rcx
+; AVX512BW-NEXT:    orq %rdi, %rcx
+; AVX512BW-NEXT:    movq %rdx, %rdi
+; AVX512BW-NEXT:    shlq $37, %rdi
+; AVX512BW-NEXT:    orq %rcx, %rdi
+; AVX512BW-NEXT:    movq %rdx, %rcx
+; AVX512BW-NEXT:    shlq $38, %rcx
+; AVX512BW-NEXT:    orq %rdi, %rcx
+; AVX512BW-NEXT:    movq %rdx, %rdi
+; AVX512BW-NEXT:    shlq $39, %rdi
+; AVX512BW-NEXT:    orq %rcx, %rdi
+; AVX512BW-NEXT:    movq %rdx, %rcx
+; AVX512BW-NEXT:    shlq $40, %rcx
+; AVX512BW-NEXT:    orq %rdi, %rcx
+; AVX512BW-NEXT:    shlq $41, %rdx
+; AVX512BW-NEXT:    orq %rcx, %rdx
+; AVX512BW-NEXT:    kmovd %k2, %ecx
+; AVX512BW-NEXT:    andl $1, %ecx
+; AVX512BW-NEXT:    movq %rcx, %rdi
+; AVX512BW-NEXT:    shlq $42, %rdi
+; AVX512BW-NEXT:    orq %rdx, %rdi
+; AVX512BW-NEXT:    movq %rcx, %rdx
+; AVX512BW-NEXT:    shlq $43, %rdx
+; AVX512BW-NEXT:    orq %rdi, %rdx
+; AVX512BW-NEXT:    movq %rcx, %rdi
+; AVX512BW-NEXT:    shlq $44, %rdi
+; AVX512BW-NEXT:    orq %rdx, %rdi
+; AVX512BW-NEXT:    movq %rcx, %rdx
+; AVX512BW-NEXT:    shlq $45, %rdx
+; AVX512BW-NEXT:    orq %rdi, %rdx
+; AVX512BW-NEXT:    movq %rcx, %rdi
+; AVX512BW-NEXT:    shlq $46, %rdi
+; AVX512BW-NEXT:    orq %rdx, %rdi
+; AVX512BW-NEXT:    movq %rcx, %rdx
+; AVX512BW-NEXT:    shlq $47, %rdx
+; AVX512BW-NEXT:    orq %rdi, %rdx
+; AVX512BW-NEXT:    shlq $48, %rcx
+; AVX512BW-NEXT:    orq %rdx, %rcx
+; AVX512BW-NEXT:    kmovd %k1, %edx
+; AVX512BW-NEXT:    andl $1, %edx
+; AVX512BW-NEXT:    movq %rdx, %rdi
+; AVX512BW-NEXT:    shlq $49, %rdi
+; AVX512BW-NEXT:    orq %rcx, %rdi
+; AVX512BW-NEXT:    movq %rdx, %rcx
+; AVX512BW-NEXT:    shlq $50, %rcx
+; AVX512BW-NEXT:    orq %rdi, %rcx
+; AVX512BW-NEXT:    movq %rdx, %rdi
+; AVX512BW-NEXT:    shlq $51, %rdi
+; AVX512BW-NEXT:    orq %rcx, %rdi
+; AVX512BW-NEXT:    movq %rdx, %rcx
+; AVX512BW-NEXT:    shlq $52, %rcx
+; AVX512BW-NEXT:    orq %rdi, %rcx
+; AVX512BW-NEXT:    movq %rdx, %rdi
+; AVX512BW-NEXT:    shlq $53, %rdi
+; AVX512BW-NEXT:    orq %rcx, %rdi
+; AVX512BW-NEXT:    movq %rdx, %rcx
+; AVX512BW-NEXT:    shlq $54, %rcx
+; AVX512BW-NEXT:    orq %rdi, %rcx
+; AVX512BW-NEXT:    shlq $55, %rdx
+; AVX512BW-NEXT:    orq %rcx, %rdx
+; AVX512BW-NEXT:    kmovd %k0, %ecx
+; AVX512BW-NEXT:    andl $1, %ecx
+; AVX512BW-NEXT:    movq %rcx, %rdi
+; AVX512BW-NEXT:    shlq $56, %rdi
+; AVX512BW-NEXT:    orq %rdx, %rdi
+; AVX512BW-NEXT:    movq %rcx, %rdx
+; AVX512BW-NEXT:    shlq $57, %rdx
+; AVX512BW-NEXT:    orq %rdi, %rdx
+; AVX512BW-NEXT:    movq %rcx, %rdi
+; AVX512BW-NEXT:    shlq $58, %rdi
+; AVX512BW-NEXT:    orq %rdx, %rdi
+; AVX512BW-NEXT:    movq %rcx, %rdx
+; AVX512BW-NEXT:    shlq $59, %rdx
+; AVX512BW-NEXT:    orq %rdi, %rdx
+; AVX512BW-NEXT:    movq %rcx, %rdi
+; AVX512BW-NEXT:    shlq $60, %rdi
+; AVX512BW-NEXT:    orq %rdx, %rdi
+; AVX512BW-NEXT:    movq %rcx, %rdx
+; AVX512BW-NEXT:    shlq $61, %rdx
+; AVX512BW-NEXT:    orq %rdi, %rdx
+; AVX512BW-NEXT:    shlq $62, %rcx
+; AVX512BW-NEXT:    orq %rdx, %rcx
+; AVX512BW-NEXT:    shlq $63, %rax
+; AVX512BW-NEXT:    orq %rcx, %rax
+; AVX512BW-NEXT:    orq %r8, %rax
+; AVX512BW-NEXT:    movq %rax, (%rsi)
+; AVX512BW-NEXT:    popq %rbx
+; AVX512BW-NEXT:    popq %r12
+; AVX512BW-NEXT:    popq %r14
+; AVX512BW-NEXT:    popq %r15
+; AVX512BW-NEXT:    popq %rbp
+; AVX512BW-NEXT:    retq
+  %src.vec = load <16 x i1>, ptr %in.vec, align 64
+  %tgt.mask = shufflevector <16 x i1> %src.vec, <16 x i1> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+  store <112 x i1> %tgt.mask, ptr %out.vec, align 64
+  ret void
+}
+
+define void @mask_replication_factor7_vf32(ptr %in.vec, ptr %out.vec) nounwind {
+; AVX512F-ONLY-LABEL: mask_replication_factor7_vf32:
+; AVX512F-ONLY:       # %bb.0:
+; AVX512F-ONLY-NEXT:    pushq %rbp
+; AVX512F-ONLY-NEXT:    pushq %r15
+; AVX512F-ONLY-NEXT:    pushq %r14
+; AVX512F-ONLY-NEXT:    pushq %r13
+; AVX512F-ONLY-NEXT:    pushq %r12
+; AVX512F-ONLY-NEXT:    pushq %rbx
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
+; AVX512F-ONLY-NEXT:    kmovw 2(%rdi), %k0
+; AVX512F-ONLY-NEXT:    kshiftrw $15, %k1, %k2
+; AVX512F-ONLY-NEXT:    kmovw %k2, %eax
+; AVX512F-ONLY-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512F-ONLY-NEXT:    kshiftrw $13, %k1, %k2
+; AVX512F-ONLY-NEXT:    kmovw %k2, %ecx
+; AVX512F-ONLY-NEXT:    kshiftrw $12, %k1, %k2
+; AVX512F-ONLY-NEXT:    kmovw %k2, %edx
+; AVX512F-ONLY-NEXT:    kshiftrw $11, %k1, %k2
+; AVX512F-ONLY-NEXT:    kmovw %k2, %edi
+; AVX512F-ONLY-NEXT:    kshiftrw $11, %k0, %k2
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r8d
+; AVX512F-ONLY-NEXT:    kshiftrw $6, %k0, %k2
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r14d
+; AVX512F-ONLY-NEXT:    kshiftrw $3, %k0, %k2
+; AVX512F-ONLY-NEXT:    kmovw %k2, %ebp
+; AVX512F-ONLY-NEXT:    kshiftrw $2, %k0, %k2
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r11d
+; AVX512F-ONLY-NEXT:    kshiftrw $12, %k0, %k2
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r15d
+; AVX512F-ONLY-NEXT:    kshiftrw $13, %k0, %k2
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r10d
+; AVX512F-ONLY-NEXT:    kshiftrw $14, %k0, %k2
+; AVX512F-ONLY-NEXT:    movl %r8d, %r9d
+; AVX512F-ONLY-NEXT:    andl $1, %r9d
+; AVX512F-ONLY-NEXT:    leal (%r9,%r9,2), %eax
+; AVX512F-ONLY-NEXT:    leal (%rax,%r9,4), %eax
+; AVX512F-ONLY-NEXT:    leal (%rax,%r9,8), %eax
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    movl %r15d, %r12d
+; AVX512F-ONLY-NEXT:    shll $4, %r12d
+; AVX512F-ONLY-NEXT:    orl %eax, %r12d
+; AVX512F-ONLY-NEXT:    movl %r15d, %ebx
+; AVX512F-ONLY-NEXT:    shll $5, %ebx
+; AVX512F-ONLY-NEXT:    orl %r12d, %ebx
+; AVX512F-ONLY-NEXT:    movl %r15d, %eax
+; AVX512F-ONLY-NEXT:    shll $6, %eax
+; AVX512F-ONLY-NEXT:    movl %r15d, %r12d
+; AVX512F-ONLY-NEXT:    shll $7, %r12d
+; AVX512F-ONLY-NEXT:    orl %eax, %r12d
+; AVX512F-ONLY-NEXT:    movl %r15d, %eax
+; AVX512F-ONLY-NEXT:    shll $8, %eax
+; AVX512F-ONLY-NEXT:    orl %r12d, %eax
+; AVX512F-ONLY-NEXT:    movl %r15d, %r13d
+; AVX512F-ONLY-NEXT:    shll $9, %r13d
+; AVX512F-ONLY-NEXT:    orl %eax, %r13d
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $15, %k0, %k2
+; AVX512F-ONLY-NEXT:    shll $10, %r15d
+; AVX512F-ONLY-NEXT:    orl %r13d, %r15d
+; AVX512F-ONLY-NEXT:    andl $1, %r10d
+; AVX512F-ONLY-NEXT:    movl %r10d, %eax
+; AVX512F-ONLY-NEXT:    shll $11, %eax
+; AVX512F-ONLY-NEXT:    orl %r15d, %eax
+; AVX512F-ONLY-NEXT:    movl %r10d, %r15d
+; AVX512F-ONLY-NEXT:    shll $12, %r15d
+; AVX512F-ONLY-NEXT:    orl %eax, %r15d
+; AVX512F-ONLY-NEXT:    movl %r10d, %eax
+; AVX512F-ONLY-NEXT:    shll $13, %eax
+; AVX512F-ONLY-NEXT:    orl %r15d, %eax
+; AVX512F-ONLY-NEXT:    movl %r10d, %r15d
+; AVX512F-ONLY-NEXT:    shll $14, %r15d
+; AVX512F-ONLY-NEXT:    orl %eax, %r15d
+; AVX512F-ONLY-NEXT:    movl %r10d, %eax
+; AVX512F-ONLY-NEXT:    shll $15, %eax
+; AVX512F-ONLY-NEXT:    orl %r15d, %eax
+; AVX512F-ONLY-NEXT:    movl %r10d, %r15d
+; AVX512F-ONLY-NEXT:    shll $16, %r15d
+; AVX512F-ONLY-NEXT:    orl %eax, %r15d
+; AVX512F-ONLY-NEXT:    kmovw %k2, %eax
+; AVX512F-ONLY-NEXT:    kshiftrw $4, %k1, %k2
+; AVX512F-ONLY-NEXT:    shll $17, %r10d
+; AVX512F-ONLY-NEXT:    orl %r15d, %r10d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movl %r12d, %r15d
+; AVX512F-ONLY-NEXT:    shll $18, %r15d
+; AVX512F-ONLY-NEXT:    orl %r10d, %r15d
+; AVX512F-ONLY-NEXT:    movl %r12d, %r10d
+; AVX512F-ONLY-NEXT:    shll $19, %r10d
+; AVX512F-ONLY-NEXT:    orl %r15d, %r10d
+; AVX512F-ONLY-NEXT:    movl %r12d, %r15d
+; AVX512F-ONLY-NEXT:    shll $20, %r15d
+; AVX512F-ONLY-NEXT:    orl %r10d, %r15d
+; AVX512F-ONLY-NEXT:    movl %r12d, %r10d
+; AVX512F-ONLY-NEXT:    shll $21, %r10d
+; AVX512F-ONLY-NEXT:    orl %r15d, %r10d
+; AVX512F-ONLY-NEXT:    movl %r12d, %r15d
+; AVX512F-ONLY-NEXT:    shll $22, %r15d
+; AVX512F-ONLY-NEXT:    orl %r10d, %r15d
+; AVX512F-ONLY-NEXT:    movl %r12d, %r10d
+; AVX512F-ONLY-NEXT:    shll $23, %r10d
+; AVX512F-ONLY-NEXT:    orl %r15d, %r10d
+; AVX512F-ONLY-NEXT:    shll $24, %r12d
+; AVX512F-ONLY-NEXT:    orl %r10d, %r12d
+; AVX512F-ONLY-NEXT:    movl %eax, %r15d
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    movl %r15d, %r10d
+; AVX512F-ONLY-NEXT:    shll $25, %r10d
+; AVX512F-ONLY-NEXT:    orl %r12d, %r10d
+; AVX512F-ONLY-NEXT:    movl %r15d, %r12d
+; AVX512F-ONLY-NEXT:    shll $26, %r12d
+; AVX512F-ONLY-NEXT:    orl %r10d, %r12d
+; AVX512F-ONLY-NEXT:    movl %r15d, %r10d
+; AVX512F-ONLY-NEXT:    shll $27, %r10d
+; AVX512F-ONLY-NEXT:    orl %r12d, %r10d
+; AVX512F-ONLY-NEXT:    movl %r15d, %r12d
+; AVX512F-ONLY-NEXT:    shll $28, %r12d
+; AVX512F-ONLY-NEXT:    orl %r10d, %r12d
+; AVX512F-ONLY-NEXT:    movl %r15d, %r13d
+; AVX512F-ONLY-NEXT:    shll $29, %r13d
+; AVX512F-ONLY-NEXT:    orl %r12d, %r13d
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r10d
+; AVX512F-ONLY-NEXT:    kshiftrw $1, %k1, %k2
+; AVX512F-ONLY-NEXT:    shll $30, %r15d
+; AVX512F-ONLY-NEXT:    orl %r13d, %r15d
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $2, %k1, %k2
+; AVX512F-ONLY-NEXT:    shll $31, %eax
+; AVX512F-ONLY-NEXT:    orl %r15d, %eax
+; AVX512F-ONLY-NEXT:    orl %ebx, %eax
+; AVX512F-ONLY-NEXT:    movl %eax, 24(%rsi)
+; AVX512F-ONLY-NEXT:    kmovw %k1, %eax
+; AVX512F-ONLY-NEXT:    movzbl %al, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    leaq (%rax,%rax,2), %rbx
+; AVX512F-ONLY-NEXT:    leaq (%rbx,%rax,4), %rbx
+; AVX512F-ONLY-NEXT:    leaq (%rbx,%rax,8), %rbx
+; AVX512F-ONLY-NEXT:    movq %rax, %r15
+; AVX512F-ONLY-NEXT:    shlq $4, %r15
+; AVX512F-ONLY-NEXT:    orq %rbx, %r15
+; AVX512F-ONLY-NEXT:    movq %rax, %rbx
+; AVX512F-ONLY-NEXT:    shlq $5, %rbx
+; AVX512F-ONLY-NEXT:    orq %r15, %rbx
+; AVX512F-ONLY-NEXT:    shlq $6, %rax
+; AVX512F-ONLY-NEXT:    movzbl %r12b, %r15d
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $7, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $8, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $9, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $10, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $11, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $12, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $3, %k1, %k2
+; AVX512F-ONLY-NEXT:    shlq $13, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    movzbl %r12b, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $14, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r15
+; AVX512F-ONLY-NEXT:    shlq $15, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $16, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r15
+; AVX512F-ONLY-NEXT:    shlq $17, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $18, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r15
+; AVX512F-ONLY-NEXT:    shlq $19, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    kmovw %k2, %eax
+; AVX512F-ONLY-NEXT:    kshiftrw $9, %k1, %k2
+; AVX512F-ONLY-NEXT:    shlq $20, %r12
+; AVX512F-ONLY-NEXT:    orq %r15, %r12
+; AVX512F-ONLY-NEXT:    movzbl %al, %r15d
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $21, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $22, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $23, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $24, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $25, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $26, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $27, %r15
+; AVX512F-ONLY-NEXT:    orq %r12, %r15
+; AVX512F-ONLY-NEXT:    movzbl %r10b, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $28, %r12
+; AVX512F-ONLY-NEXT:    orq %r15, %r12
+; AVX512F-ONLY-NEXT:    movq %rax, %r15
+; AVX512F-ONLY-NEXT:    shlq $29, %r15
+; AVX512F-ONLY-NEXT:    orq %r12, %r15
+; AVX512F-ONLY-NEXT:    movq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $30, %r12
+; AVX512F-ONLY-NEXT:    orq %r15, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r15d
+; AVX512F-ONLY-NEXT:    kshiftrw $5, %k1, %k2
+; AVX512F-ONLY-NEXT:    shlq $31, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    andl $1, %r10d
+; AVX512F-ONLY-NEXT:    movq %r10, %r12
+; AVX512F-ONLY-NEXT:    shlq $32, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r10, %rax
+; AVX512F-ONLY-NEXT:    shlq $33, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $6, %k1, %k2
+; AVX512F-ONLY-NEXT:    shlq $34, %r10
+; AVX512F-ONLY-NEXT:    orq %rax, %r10
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $35, %rax
+; AVX512F-ONLY-NEXT:    orq %r10, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r10
+; AVX512F-ONLY-NEXT:    shlq $36, %r10
+; AVX512F-ONLY-NEXT:    orq %rax, %r10
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $37, %rax
+; AVX512F-ONLY-NEXT:    orq %r10, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r10
+; AVX512F-ONLY-NEXT:    shlq $38, %r10
+; AVX512F-ONLY-NEXT:    orq %rax, %r10
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $39, %rax
+; AVX512F-ONLY-NEXT:    orq %r10, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $40, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r10d
+; AVX512F-ONLY-NEXT:    kshiftrw $7, %k1, %k2
+; AVX512F-ONLY-NEXT:    shlq $41, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    andl $1, %r10d
+; AVX512F-ONLY-NEXT:    movq %r10, %rax
+; AVX512F-ONLY-NEXT:    shlq $42, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r10, %r12
+; AVX512F-ONLY-NEXT:    shlq $43, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r10, %rax
+; AVX512F-ONLY-NEXT:    shlq $44, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r10, %r12
+; AVX512F-ONLY-NEXT:    shlq $45, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r10, %rax
+; AVX512F-ONLY-NEXT:    shlq $46, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r10, %r13
+; AVX512F-ONLY-NEXT:    shlq $47, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $8, %k1, %k2
+; AVX512F-ONLY-NEXT:    shlq $48, %r10
+; AVX512F-ONLY-NEXT:    orq %r13, %r10
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $49, %rax
+; AVX512F-ONLY-NEXT:    orq %r10, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r10
+; AVX512F-ONLY-NEXT:    shlq $50, %r10
+; AVX512F-ONLY-NEXT:    orq %rax, %r10
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $51, %rax
+; AVX512F-ONLY-NEXT:    orq %r10, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r10
+; AVX512F-ONLY-NEXT:    shlq $52, %r10
+; AVX512F-ONLY-NEXT:    orq %rax, %r10
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $53, %rax
+; AVX512F-ONLY-NEXT:    orq %r10, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r10
+; AVX512F-ONLY-NEXT:    shlq $54, %r10
+; AVX512F-ONLY-NEXT:    orq %rax, %r10
+; AVX512F-ONLY-NEXT:    kmovw %k2, %eax
+; AVX512F-ONLY-NEXT:    kshiftrw $4, %k0, %k2
+; AVX512F-ONLY-NEXT:    shlq $55, %r12
+; AVX512F-ONLY-NEXT:    orq %r10, %r12
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %r10
+; AVX512F-ONLY-NEXT:    shlq $56, %r10
+; AVX512F-ONLY-NEXT:    orq %r12, %r10
+; AVX512F-ONLY-NEXT:    movq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $57, %r12
+; AVX512F-ONLY-NEXT:    orq %r10, %r12
+; AVX512F-ONLY-NEXT:    movq %rax, %r10
+; AVX512F-ONLY-NEXT:    shlq $58, %r10
+; AVX512F-ONLY-NEXT:    orq %r12, %r10
+; AVX512F-ONLY-NEXT:    movq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $59, %r12
+; AVX512F-ONLY-NEXT:    orq %r10, %r12
+; AVX512F-ONLY-NEXT:    movq %rax, %r10
+; AVX512F-ONLY-NEXT:    shlq $60, %r10
+; AVX512F-ONLY-NEXT:    orq %r12, %r10
+; AVX512F-ONLY-NEXT:    movq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $61, %r12
+; AVX512F-ONLY-NEXT:    orq %r10, %r12
+; AVX512F-ONLY-NEXT:    shlq $62, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movzbl %r15b, %r10d
+; AVX512F-ONLY-NEXT:    movq %r10, %r15
+; AVX512F-ONLY-NEXT:    shlq $63, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    orq %rbx, %r15
+; AVX512F-ONLY-NEXT:    movq %r15, (%rsi)
+; AVX512F-ONLY-NEXT:    movzbl %r11b, %r11d
+; AVX512F-ONLY-NEXT:    movl %r11d, %ebx
+; AVX512F-ONLY-NEXT:    andl $1, %ebx
+; AVX512F-ONLY-NEXT:    leaq (%rbx,%rbx,2), %rax
+; AVX512F-ONLY-NEXT:    leaq (%rax,%rbx,4), %rax
+; AVX512F-ONLY-NEXT:    leaq (%rax,%rbx,8), %rax
+; AVX512F-ONLY-NEXT:    movq %rbx, %r12
+; AVX512F-ONLY-NEXT:    shlq $4, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movzbl %bpl, %r13d
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %r15
+; AVX512F-ONLY-NEXT:    shlq $5, %r15
+; AVX512F-ONLY-NEXT:    orq %r12, %r15
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $6, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $7, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $8, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $9, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $10, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k2, %ebp
+; AVX512F-ONLY-NEXT:    kshiftrw $5, %k0, %k2
+; AVX512F-ONLY-NEXT:    shlq $11, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movzbl %bpl, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $12, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $13, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $14, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $15, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $16, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $17, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k2, %eax
+; AVX512F-ONLY-NEXT:    kshiftrw $10, %k1, %k2
+; AVX512F-ONLY-NEXT:    shlq $18, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    movzbl %al, %r13d
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $19, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $20, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $21, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $22, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $23, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $24, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $25, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    movzbl %r14b, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $26, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    movq %rax, %r13
+; AVX512F-ONLY-NEXT:    shlq $27, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    movq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $28, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    movq %rax, %r13
+; AVX512F-ONLY-NEXT:    shlq $29, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    movq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $30, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k2, %ebp
+; AVX512F-ONLY-NEXT:    kshiftrw $7, %k0, %k2
+; AVX512F-ONLY-NEXT:    shlq $31, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $8, %k0, %k2
+; AVX512F-ONLY-NEXT:    andl $1, %r14d
+; AVX512F-ONLY-NEXT:    shlq $32, %r14
+; AVX512F-ONLY-NEXT:    orq %rax, %r14
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $33, %rax
+; AVX512F-ONLY-NEXT:    orq %r14, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r14
+; AVX512F-ONLY-NEXT:    shlq $34, %r14
+; AVX512F-ONLY-NEXT:    orq %rax, %r14
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $35, %rax
+; AVX512F-ONLY-NEXT:    orq %r14, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r14
+; AVX512F-ONLY-NEXT:    shlq $36, %r14
+; AVX512F-ONLY-NEXT:    orq %rax, %r14
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $37, %rax
+; AVX512F-ONLY-NEXT:    orq %r14, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $38, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r14d
+; AVX512F-ONLY-NEXT:    kshiftrw $9, %k0, %k2
+; AVX512F-ONLY-NEXT:    shlq $39, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    andl $1, %r14d
+; AVX512F-ONLY-NEXT:    movq %r14, %rax
+; AVX512F-ONLY-NEXT:    shlq $40, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r14, %r12
+; AVX512F-ONLY-NEXT:    shlq $41, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r14, %rax
+; AVX512F-ONLY-NEXT:    shlq $42, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r14, %r12
+; AVX512F-ONLY-NEXT:    shlq $43, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r14, %rax
+; AVX512F-ONLY-NEXT:    shlq $44, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r14, %r13
+; AVX512F-ONLY-NEXT:    shlq $45, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $10, %k0, %k2
+; AVX512F-ONLY-NEXT:    shlq $46, %r14
+; AVX512F-ONLY-NEXT:    orq %r13, %r14
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $47, %rax
+; AVX512F-ONLY-NEXT:    orq %r14, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r14
+; AVX512F-ONLY-NEXT:    shlq $48, %r14
+; AVX512F-ONLY-NEXT:    orq %rax, %r14
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $49, %rax
+; AVX512F-ONLY-NEXT:    orq %r14, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r14
+; AVX512F-ONLY-NEXT:    shlq $50, %r14
+; AVX512F-ONLY-NEXT:    orq %rax, %r14
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $51, %rax
+; AVX512F-ONLY-NEXT:    orq %r14, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $52, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r14d
+; AVX512F-ONLY-NEXT:    kshiftrw $14, %k1, %k1
+; AVX512F-ONLY-NEXT:    shlq $53, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    andl $1, %r14d
+; AVX512F-ONLY-NEXT:    movq %r14, %rax
+; AVX512F-ONLY-NEXT:    shlq $54, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r14, %r12
+; AVX512F-ONLY-NEXT:    shlq $55, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r14, %rax
+; AVX512F-ONLY-NEXT:    shlq $56, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r14, %r12
+; AVX512F-ONLY-NEXT:    shlq $57, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r14, %rax
+; AVX512F-ONLY-NEXT:    shlq $58, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r14, %r12
+; AVX512F-ONLY-NEXT:    shlq $59, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k1, %r13d
+; AVX512F-ONLY-NEXT:    shlq $60, %r14
+; AVX512F-ONLY-NEXT:    orq %r12, %r14
+; AVX512F-ONLY-NEXT:    movq %r9, %r12
+; AVX512F-ONLY-NEXT:    shlq $61, %r12
+; AVX512F-ONLY-NEXT:    orq %r14, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k0, %r14d
+; AVX512F-ONLY-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512F-ONLY-NEXT:    shlq $62, %r9
+; AVX512F-ONLY-NEXT:    orq %r12, %r9
+; AVX512F-ONLY-NEXT:    kmovw %k0, %r12d
+; AVX512F-ONLY-NEXT:    movzbl %r8b, %eax
+; AVX512F-ONLY-NEXT:    shlq $63, %rax
+; AVX512F-ONLY-NEXT:    orq %r9, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movq %rax, 16(%rsi)
+; AVX512F-ONLY-NEXT:    andl $1, %r10d
+; AVX512F-ONLY-NEXT:    leaq (%r10,%r10,2), %rax
+; AVX512F-ONLY-NEXT:    leaq (%rax,%r10,4), %rax
+; AVX512F-ONLY-NEXT:    leaq (%rax,%r10,8), %rax
+; AVX512F-ONLY-NEXT:    movq %r10, %r8
+; AVX512F-ONLY-NEXT:    shlq $4, %r8
+; AVX512F-ONLY-NEXT:    orq %rax, %r8
+; AVX512F-ONLY-NEXT:    shlq $5, %r10
+; AVX512F-ONLY-NEXT:    orq %r8, %r10
+; AVX512F-ONLY-NEXT:    movzbl %bpl, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %r8
+; AVX512F-ONLY-NEXT:    shlq $6, %r8
+; AVX512F-ONLY-NEXT:    movq %rax, %r9
+; AVX512F-ONLY-NEXT:    shlq $7, %r9
+; AVX512F-ONLY-NEXT:    orq %r8, %r9
+; AVX512F-ONLY-NEXT:    movq %rax, %r8
+; AVX512F-ONLY-NEXT:    shlq $8, %r8
+; AVX512F-ONLY-NEXT:    orq %r9, %r8
+; AVX512F-ONLY-NEXT:    movq %rax, %r9
+; AVX512F-ONLY-NEXT:    shlq $9, %r9
+; AVX512F-ONLY-NEXT:    orq %r8, %r9
+; AVX512F-ONLY-NEXT:    movq %rax, %r8
+; AVX512F-ONLY-NEXT:    shlq $10, %r8
+; AVX512F-ONLY-NEXT:    orq %r9, %r8
+; AVX512F-ONLY-NEXT:    movq %rax, %r9
+; AVX512F-ONLY-NEXT:    shlq $11, %r9
+; AVX512F-ONLY-NEXT:    orq %r8, %r9
+; AVX512F-ONLY-NEXT:    shlq $12, %rax
+; AVX512F-ONLY-NEXT:    orq %r9, %rax
+; AVX512F-ONLY-NEXT:    movzbl %dil, %edi
+; AVX512F-ONLY-NEXT:    andl $1, %edi
+; AVX512F-ONLY-NEXT:    movq %rdi, %r8
+; AVX512F-ONLY-NEXT:    shlq $13, %r8
+; AVX512F-ONLY-NEXT:    orq %rax, %r8
+; AVX512F-ONLY-NEXT:    movq %rdi, %rax
+; AVX512F-ONLY-NEXT:    shlq $14, %rax
+; AVX512F-ONLY-NEXT:    orq %r8, %rax
+; AVX512F-ONLY-NEXT:    movq %rdi, %r8
+; AVX512F-ONLY-NEXT:    shlq $15, %r8
+; AVX512F-ONLY-NEXT:    orq %rax, %r8
+; AVX512F-ONLY-NEXT:    movq %rdi, %rax
+; AVX512F-ONLY-NEXT:    shlq $16, %rax
+; AVX512F-ONLY-NEXT:    orq %r8, %rax
+; AVX512F-ONLY-NEXT:    movq %rdi, %r8
+; AVX512F-ONLY-NEXT:    shlq $17, %r8
+; AVX512F-ONLY-NEXT:    orq %rax, %r8
+; AVX512F-ONLY-NEXT:    movq %rdi, %rax
+; AVX512F-ONLY-NEXT:    shlq $18, %rax
+; AVX512F-ONLY-NEXT:    orq %r8, %rax
+; AVX512F-ONLY-NEXT:    shlq $19, %rdi
+; AVX512F-ONLY-NEXT:    orq %rax, %rdi
+; AVX512F-ONLY-NEXT:    movzbl %dl, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %rdx
+; AVX512F-ONLY-NEXT:    shlq $20, %rdx
+; AVX512F-ONLY-NEXT:    orq %rdi, %rdx
+; AVX512F-ONLY-NEXT:    movq %rax, %rdi
+; AVX512F-ONLY-NEXT:    shlq $21, %rdi
+; AVX512F-ONLY-NEXT:    orq %rdx, %rdi
+; AVX512F-ONLY-NEXT:    movq %rax, %rdx
+; AVX512F-ONLY-NEXT:    shlq $22, %rdx
+; AVX512F-ONLY-NEXT:    orq %rdi, %rdx
+; AVX512F-ONLY-NEXT:    movq %rax, %rdi
+; AVX512F-ONLY-NEXT:    shlq $23, %rdi
+; AVX512F-ONLY-NEXT:    orq %rdx, %rdi
+; AVX512F-ONLY-NEXT:    movq %rax, %rdx
+; AVX512F-ONLY-NEXT:    shlq $24, %rdx
+; AVX512F-ONLY-NEXT:    orq %rdi, %rdx
+; AVX512F-ONLY-NEXT:    movq %rax, %rdi
+; AVX512F-ONLY-NEXT:    shlq $25, %rdi
+; AVX512F-ONLY-NEXT:    orq %rdx, %rdi
+; AVX512F-ONLY-NEXT:    shlq $26, %rax
+; AVX512F-ONLY-NEXT:    orq %rdi, %rax
+; AVX512F-ONLY-NEXT:    movzbl %cl, %ecx
+; AVX512F-ONLY-NEXT:    andl $1, %ecx
+; AVX512F-ONLY-NEXT:    movq %rcx, %rdx
+; AVX512F-ONLY-NEXT:    shlq $27, %rdx
+; AVX512F-ONLY-NEXT:    orq %rax, %rdx
+; AVX512F-ONLY-NEXT:    movq %rcx, %rax
+; AVX512F-ONLY-NEXT:    shlq $28, %rax
+; AVX512F-ONLY-NEXT:    orq %rdx, %rax
+; AVX512F-ONLY-NEXT:    movq %rcx, %rdx
+; AVX512F-ONLY-NEXT:    shlq $29, %rdx
+; AVX512F-ONLY-NEXT:    orq %rax, %rdx
+; AVX512F-ONLY-NEXT:    movq %rcx, %rax
+; AVX512F-ONLY-NEXT:    shlq $30, %rax
+; AVX512F-ONLY-NEXT:    orq %rdx, %rax
+; AVX512F-ONLY-NEXT:    movq %rcx, %rdx
+; AVX512F-ONLY-NEXT:    shlq $31, %rdx
+; AVX512F-ONLY-NEXT:    orq %rax, %rdx
+; AVX512F-ONLY-NEXT:    movq %rcx, %rax
+; AVX512F-ONLY-NEXT:    shlq $32, %rax
+; AVX512F-ONLY-NEXT:    orq %rdx, %rax
+; AVX512F-ONLY-NEXT:    shlq $33, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $34, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %rcx
+; AVX512F-ONLY-NEXT:    shlq $35, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $36, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %rcx
+; AVX512F-ONLY-NEXT:    shlq $37, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $38, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %rcx
+; AVX512F-ONLY-NEXT:    shlq $39, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    shlq $40, %r13
+; AVX512F-ONLY-NEXT:    orq %rcx, %r13
+; AVX512F-ONLY-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %rcx
+; AVX512F-ONLY-NEXT:    shlq $41, %rcx
+; AVX512F-ONLY-NEXT:    orq %r13, %rcx
+; AVX512F-ONLY-NEXT:    movq %rax, %rdx
+; AVX512F-ONLY-NEXT:    shlq $42, %rdx
+; AVX512F-ONLY-NEXT:    orq %rcx, %rdx
+; AVX512F-ONLY-NEXT:    movq %rax, %rcx
+; AVX512F-ONLY-NEXT:    shlq $43, %rcx
+; AVX512F-ONLY-NEXT:    orq %rdx, %rcx
+; AVX512F-ONLY-NEXT:    movq %rax, %rdx
+; AVX512F-ONLY-NEXT:    shlq $44, %rdx
+; AVX512F-ONLY-NEXT:    orq %rcx, %rdx
+; AVX512F-ONLY-NEXT:    movq %rax, %rcx
+; AVX512F-ONLY-NEXT:    shlq $45, %rcx
+; AVX512F-ONLY-NEXT:    orq %rdx, %rcx
+; AVX512F-ONLY-NEXT:    movq %rax, %rdx
+; AVX512F-ONLY-NEXT:    shlq $46, %rdx
+; AVX512F-ONLY-NEXT:    orq %rcx, %rdx
+; AVX512F-ONLY-NEXT:    shlq $47, %rax
+; AVX512F-ONLY-NEXT:    orq %rdx, %rax
+; AVX512F-ONLY-NEXT:    andl $1, %r14d
+; AVX512F-ONLY-NEXT:    movq %r14, %rcx
+; AVX512F-ONLY-NEXT:    shlq $48, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    movq %r14, %rax
+; AVX512F-ONLY-NEXT:    shlq $49, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    movq %r14, %rcx
+; AVX512F-ONLY-NEXT:    shlq $50, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    movq %r14, %rax
+; AVX512F-ONLY-NEXT:    shlq $51, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    movq %r14, %rcx
+; AVX512F-ONLY-NEXT:    shlq $52, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    movq %r14, %rax
+; AVX512F-ONLY-NEXT:    shlq $53, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    shlq $54, %r14
+; AVX512F-ONLY-NEXT:    orq %rax, %r14
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $55, %rax
+; AVX512F-ONLY-NEXT:    orq %r14, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %rcx
+; AVX512F-ONLY-NEXT:    shlq $56, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $57, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %rcx
+; AVX512F-ONLY-NEXT:    shlq $58, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $59, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %rcx
+; AVX512F-ONLY-NEXT:    shlq $60, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    shlq $61, %r12
+; AVX512F-ONLY-NEXT:    orq %rcx, %r12
+; AVX512F-ONLY-NEXT:    shlq $62, %rbx
+; AVX512F-ONLY-NEXT:    orq %r12, %rbx
+; AVX512F-ONLY-NEXT:    shlq $63, %r11
+; AVX512F-ONLY-NEXT:    orq %rbx, %r11
+; AVX512F-ONLY-NEXT:    orq %r10, %r11
+; AVX512F-ONLY-NEXT:    movq %r11, 8(%rsi)
+; AVX512F-ONLY-NEXT:    popq %rbx
+; AVX512F-ONLY-NEXT:    popq %r12
+; AVX512F-ONLY-NEXT:    popq %r13
+; AVX512F-ONLY-NEXT:    popq %r14
+; AVX512F-ONLY-NEXT:    popq %r15
+; AVX512F-ONLY-NEXT:    popq %rbp
+; AVX512F-ONLY-NEXT:    retq
+;
+; AVX512DQ-LABEL: mask_replication_factor7_vf32:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    pushq %rbp
+; AVX512DQ-NEXT:    pushq %r15
+; AVX512DQ-NEXT:    pushq %r14
+; AVX512DQ-NEXT:    pushq %r13
+; AVX512DQ-NEXT:    pushq %r12
+; AVX512DQ-NEXT:    pushq %rbx
+; AVX512DQ-NEXT:    kmovw (%rdi), %k1
+; AVX512DQ-NEXT:    kmovw 2(%rdi), %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k2
+; AVX512DQ-NEXT:    kmovw %k2, %eax
+; AVX512DQ-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512DQ-NEXT:    kshiftrw $13, %k1, %k2
+; AVX512DQ-NEXT:    kmovw %k2, %ecx
+; AVX512DQ-NEXT:    kshiftrw $12, %k1, %k2
+; AVX512DQ-NEXT:    kmovw %k2, %edx
+; AVX512DQ-NEXT:    kshiftrw $11, %k1, %k2
+; AVX512DQ-NEXT:    kmovw %k2, %edi
+; AVX512DQ-NEXT:    kshiftrw $11, %k0, %k2
+; AVX512DQ-NEXT:    kmovw %k2, %r8d
+; AVX512DQ-NEXT:    kshiftrw $6, %k0, %k2
+; AVX512DQ-NEXT:    kmovw %k2, %r14d
+; AVX512DQ-NEXT:    kshiftrw $3, %k0, %k2
+; AVX512DQ-NEXT:    kmovw %k2, %ebp
+; AVX512DQ-NEXT:    kshiftrw $2, %k0, %k2
+; AVX512DQ-NEXT:    kmovw %k2, %r11d
+; AVX512DQ-NEXT:    kshiftrw $12, %k0, %k2
+; AVX512DQ-NEXT:    kmovw %k2, %r15d
+; AVX512DQ-NEXT:    kshiftrw $13, %k0, %k2
+; AVX512DQ-NEXT:    kmovw %k2, %r10d
+; AVX512DQ-NEXT:    kshiftrw $14, %k0, %k2
+; AVX512DQ-NEXT:    movl %r8d, %r9d
+; AVX512DQ-NEXT:    andl $1, %r9d
+; AVX512DQ-NEXT:    leal (%r9,%r9,2), %eax
+; AVX512DQ-NEXT:    leal (%rax,%r9,4), %eax
+; AVX512DQ-NEXT:    leal (%rax,%r9,8), %eax
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    movl %r15d, %r12d
+; AVX512DQ-NEXT:    shll $4, %r12d
+; AVX512DQ-NEXT:    orl %eax, %r12d
+; AVX512DQ-NEXT:    movl %r15d, %ebx
+; AVX512DQ-NEXT:    shll $5, %ebx
+; AVX512DQ-NEXT:    orl %r12d, %ebx
+; AVX512DQ-NEXT:    movl %r15d, %eax
+; AVX512DQ-NEXT:    shll $6, %eax
+; AVX512DQ-NEXT:    movl %r15d, %r12d
+; AVX512DQ-NEXT:    shll $7, %r12d
+; AVX512DQ-NEXT:    orl %eax, %r12d
+; AVX512DQ-NEXT:    movl %r15d, %eax
+; AVX512DQ-NEXT:    shll $8, %eax
+; AVX512DQ-NEXT:    orl %r12d, %eax
+; AVX512DQ-NEXT:    movl %r15d, %r13d
+; AVX512DQ-NEXT:    shll $9, %r13d
+; AVX512DQ-NEXT:    orl %eax, %r13d
+; AVX512DQ-NEXT:    kmovw %k2, %r12d
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k2
+; AVX512DQ-NEXT:    shll $10, %r15d
+; AVX512DQ-NEXT:    orl %r13d, %r15d
+; AVX512DQ-NEXT:    andl $1, %r10d
+; AVX512DQ-NEXT:    movl %r10d, %eax
+; AVX512DQ-NEXT:    shll $11, %eax
+; AVX512DQ-NEXT:    orl %r15d, %eax
+; AVX512DQ-NEXT:    movl %r10d, %r15d
+; AVX512DQ-NEXT:    shll $12, %r15d
+; AVX512DQ-NEXT:    orl %eax, %r15d
+; AVX512DQ-NEXT:    movl %r10d, %eax
+; AVX512DQ-NEXT:    shll $13, %eax
+; AVX512DQ-NEXT:    orl %r15d, %eax
+; AVX512DQ-NEXT:    movl %r10d, %r15d
+; AVX512DQ-NEXT:    shll $14, %r15d
+; AVX512DQ-NEXT:    orl %eax, %r15d
+; AVX512DQ-NEXT:    movl %r10d, %eax
+; AVX512DQ-NEXT:    shll $15, %eax
+; AVX512DQ-NEXT:    orl %r15d, %eax
+; AVX512DQ-NEXT:    movl %r10d, %r15d
+; AVX512DQ-NEXT:    shll $16, %r15d
+; AVX512DQ-NEXT:    orl %eax, %r15d
+; AVX512DQ-NEXT:    kmovw %k2, %eax
+; AVX512DQ-NEXT:    kshiftrw $4, %k1, %k2
+; AVX512DQ-NEXT:    shll $17, %r10d
+; AVX512DQ-NEXT:    orl %r15d, %r10d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movl %r12d, %r15d
+; AVX512DQ-NEXT:    shll $18, %r15d
+; AVX512DQ-NEXT:    orl %r10d, %r15d
+; AVX512DQ-NEXT:    movl %r12d, %r10d
+; AVX512DQ-NEXT:    shll $19, %r10d
+; AVX512DQ-NEXT:    orl %r15d, %r10d
+; AVX512DQ-NEXT:    movl %r12d, %r15d
+; AVX512DQ-NEXT:    shll $20, %r15d
+; AVX512DQ-NEXT:    orl %r10d, %r15d
+; AVX512DQ-NEXT:    movl %r12d, %r10d
+; AVX512DQ-NEXT:    shll $21, %r10d
+; AVX512DQ-NEXT:    orl %r15d, %r10d
+; AVX512DQ-NEXT:    movl %r12d, %r15d
+; AVX512DQ-NEXT:    shll $22, %r15d
+; AVX512DQ-NEXT:    orl %r10d, %r15d
+; AVX512DQ-NEXT:    movl %r12d, %r10d
+; AVX512DQ-NEXT:    shll $23, %r10d
+; AVX512DQ-NEXT:    orl %r15d, %r10d
+; AVX512DQ-NEXT:    shll $24, %r12d
+; AVX512DQ-NEXT:    orl %r10d, %r12d
+; AVX512DQ-NEXT:    movl %eax, %r15d
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    movl %r15d, %r10d
+; AVX512DQ-NEXT:    shll $25, %r10d
+; AVX512DQ-NEXT:    orl %r12d, %r10d
+; AVX512DQ-NEXT:    movl %r15d, %r12d
+; AVX512DQ-NEXT:    shll $26, %r12d
+; AVX512DQ-NEXT:    orl %r10d, %r12d
+; AVX512DQ-NEXT:    movl %r15d, %r10d
+; AVX512DQ-NEXT:    shll $27, %r10d
+; AVX512DQ-NEXT:    orl %r12d, %r10d
+; AVX512DQ-NEXT:    movl %r15d, %r12d
+; AVX512DQ-NEXT:    shll $28, %r12d
+; AVX512DQ-NEXT:    orl %r10d, %r12d
+; AVX512DQ-NEXT:    movl %r15d, %r13d
+; AVX512DQ-NEXT:    shll $29, %r13d
+; AVX512DQ-NEXT:    orl %r12d, %r13d
+; AVX512DQ-NEXT:    kmovw %k2, %r10d
+; AVX512DQ-NEXT:    kshiftrw $1, %k1, %k2
+; AVX512DQ-NEXT:    shll $30, %r15d
+; AVX512DQ-NEXT:    orl %r13d, %r15d
+; AVX512DQ-NEXT:    kmovw %k2, %r12d
+; AVX512DQ-NEXT:    kshiftrw $2, %k1, %k2
+; AVX512DQ-NEXT:    shll $31, %eax
+; AVX512DQ-NEXT:    orl %r15d, %eax
+; AVX512DQ-NEXT:    orl %ebx, %eax
+; AVX512DQ-NEXT:    movl %eax, 24(%rsi)
+; AVX512DQ-NEXT:    kmovw %k1, %eax
+; AVX512DQ-NEXT:    movzbl %al, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    leaq (%rax,%rax,2), %rbx
+; AVX512DQ-NEXT:    leaq (%rbx,%rax,4), %rbx
+; AVX512DQ-NEXT:    leaq (%rbx,%rax,8), %rbx
+; AVX512DQ-NEXT:    movq %rax, %r15
+; AVX512DQ-NEXT:    shlq $4, %r15
+; AVX512DQ-NEXT:    orq %rbx, %r15
+; AVX512DQ-NEXT:    movq %rax, %rbx
+; AVX512DQ-NEXT:    shlq $5, %rbx
+; AVX512DQ-NEXT:    orq %r15, %rbx
+; AVX512DQ-NEXT:    shlq $6, %rax
+; AVX512DQ-NEXT:    movzbl %r12b, %r15d
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $7, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $8, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $9, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $10, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $11, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $12, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    kmovw %k2, %r12d
+; AVX512DQ-NEXT:    kshiftrw $3, %k1, %k2
+; AVX512DQ-NEXT:    shlq $13, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    movzbl %r12b, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $14, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movq %r12, %r15
+; AVX512DQ-NEXT:    shlq $15, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $16, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movq %r12, %r15
+; AVX512DQ-NEXT:    shlq $17, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $18, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movq %r12, %r15
+; AVX512DQ-NEXT:    shlq $19, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    kmovw %k2, %eax
+; AVX512DQ-NEXT:    kshiftrw $9, %k1, %k2
+; AVX512DQ-NEXT:    shlq $20, %r12
+; AVX512DQ-NEXT:    orq %r15, %r12
+; AVX512DQ-NEXT:    movzbl %al, %r15d
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $21, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $22, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $23, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $24, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $25, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $26, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    shlq $27, %r15
+; AVX512DQ-NEXT:    orq %r12, %r15
+; AVX512DQ-NEXT:    movzbl %r10b, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %r12
+; AVX512DQ-NEXT:    shlq $28, %r12
+; AVX512DQ-NEXT:    orq %r15, %r12
+; AVX512DQ-NEXT:    movq %rax, %r15
+; AVX512DQ-NEXT:    shlq $29, %r15
+; AVX512DQ-NEXT:    orq %r12, %r15
+; AVX512DQ-NEXT:    movq %rax, %r12
+; AVX512DQ-NEXT:    shlq $30, %r12
+; AVX512DQ-NEXT:    orq %r15, %r12
+; AVX512DQ-NEXT:    kmovw %k2, %r15d
+; AVX512DQ-NEXT:    kshiftrw $5, %k1, %k2
+; AVX512DQ-NEXT:    shlq $31, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    andl $1, %r10d
+; AVX512DQ-NEXT:    movq %r10, %r12
+; AVX512DQ-NEXT:    shlq $32, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r10, %rax
+; AVX512DQ-NEXT:    shlq $33, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    kmovw %k2, %r12d
+; AVX512DQ-NEXT:    kshiftrw $6, %k1, %k2
+; AVX512DQ-NEXT:    shlq $34, %r10
+; AVX512DQ-NEXT:    orq %rax, %r10
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $35, %rax
+; AVX512DQ-NEXT:    orq %r10, %rax
+; AVX512DQ-NEXT:    movq %r12, %r10
+; AVX512DQ-NEXT:    shlq $36, %r10
+; AVX512DQ-NEXT:    orq %rax, %r10
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $37, %rax
+; AVX512DQ-NEXT:    orq %r10, %rax
+; AVX512DQ-NEXT:    movq %r12, %r10
+; AVX512DQ-NEXT:    shlq $38, %r10
+; AVX512DQ-NEXT:    orq %rax, %r10
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $39, %rax
+; AVX512DQ-NEXT:    orq %r10, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $40, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    kmovw %k2, %r10d
+; AVX512DQ-NEXT:    kshiftrw $7, %k1, %k2
+; AVX512DQ-NEXT:    shlq $41, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    andl $1, %r10d
+; AVX512DQ-NEXT:    movq %r10, %rax
+; AVX512DQ-NEXT:    shlq $42, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r10, %r12
+; AVX512DQ-NEXT:    shlq $43, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r10, %rax
+; AVX512DQ-NEXT:    shlq $44, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r10, %r12
+; AVX512DQ-NEXT:    shlq $45, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r10, %rax
+; AVX512DQ-NEXT:    shlq $46, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r10, %r13
+; AVX512DQ-NEXT:    shlq $47, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    kmovw %k2, %r12d
+; AVX512DQ-NEXT:    kshiftrw $8, %k1, %k2
+; AVX512DQ-NEXT:    shlq $48, %r10
+; AVX512DQ-NEXT:    orq %r13, %r10
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $49, %rax
+; AVX512DQ-NEXT:    orq %r10, %rax
+; AVX512DQ-NEXT:    movq %r12, %r10
+; AVX512DQ-NEXT:    shlq $50, %r10
+; AVX512DQ-NEXT:    orq %rax, %r10
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $51, %rax
+; AVX512DQ-NEXT:    orq %r10, %rax
+; AVX512DQ-NEXT:    movq %r12, %r10
+; AVX512DQ-NEXT:    shlq $52, %r10
+; AVX512DQ-NEXT:    orq %rax, %r10
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $53, %rax
+; AVX512DQ-NEXT:    orq %r10, %rax
+; AVX512DQ-NEXT:    movq %r12, %r10
+; AVX512DQ-NEXT:    shlq $54, %r10
+; AVX512DQ-NEXT:    orq %rax, %r10
+; AVX512DQ-NEXT:    kmovw %k2, %eax
+; AVX512DQ-NEXT:    kshiftrw $4, %k0, %k2
+; AVX512DQ-NEXT:    shlq $55, %r12
+; AVX512DQ-NEXT:    orq %r10, %r12
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %r10
+; AVX512DQ-NEXT:    shlq $56, %r10
+; AVX512DQ-NEXT:    orq %r12, %r10
+; AVX512DQ-NEXT:    movq %rax, %r12
+; AVX512DQ-NEXT:    shlq $57, %r12
+; AVX512DQ-NEXT:    orq %r10, %r12
+; AVX512DQ-NEXT:    movq %rax, %r10
+; AVX512DQ-NEXT:    shlq $58, %r10
+; AVX512DQ-NEXT:    orq %r12, %r10
+; AVX512DQ-NEXT:    movq %rax, %r12
+; AVX512DQ-NEXT:    shlq $59, %r12
+; AVX512DQ-NEXT:    orq %r10, %r12
+; AVX512DQ-NEXT:    movq %rax, %r10
+; AVX512DQ-NEXT:    shlq $60, %r10
+; AVX512DQ-NEXT:    orq %r12, %r10
+; AVX512DQ-NEXT:    movq %rax, %r12
+; AVX512DQ-NEXT:    shlq $61, %r12
+; AVX512DQ-NEXT:    orq %r10, %r12
+; AVX512DQ-NEXT:    shlq $62, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movzbl %r15b, %r10d
+; AVX512DQ-NEXT:    movq %r10, %r15
+; AVX512DQ-NEXT:    shlq $63, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    orq %rbx, %r15
+; AVX512DQ-NEXT:    movq %r15, (%rsi)
+; AVX512DQ-NEXT:    movzbl %r11b, %r11d
+; AVX512DQ-NEXT:    movl %r11d, %ebx
+; AVX512DQ-NEXT:    andl $1, %ebx
+; AVX512DQ-NEXT:    leaq (%rbx,%rbx,2), %rax
+; AVX512DQ-NEXT:    leaq (%rax,%rbx,4), %rax
+; AVX512DQ-NEXT:    leaq (%rax,%rbx,8), %rax
+; AVX512DQ-NEXT:    movq %rbx, %r12
+; AVX512DQ-NEXT:    shlq $4, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movzbl %bpl, %r13d
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %r15
+; AVX512DQ-NEXT:    shlq $5, %r15
+; AVX512DQ-NEXT:    orq %r12, %r15
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $6, %rax
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $7, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $8, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $9, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $10, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    kmovw %k2, %ebp
+; AVX512DQ-NEXT:    kshiftrw $5, %k0, %k2
+; AVX512DQ-NEXT:    shlq $11, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movzbl %bpl, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $12, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $13, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $14, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $15, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $16, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $17, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    kmovw %k2, %eax
+; AVX512DQ-NEXT:    kshiftrw $10, %k1, %k2
+; AVX512DQ-NEXT:    shlq $18, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    movzbl %al, %r13d
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $19, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $20, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $21, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $22, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $23, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $24, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    shlq $25, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    movzbl %r14b, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %r12
+; AVX512DQ-NEXT:    shlq $26, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    movq %rax, %r13
+; AVX512DQ-NEXT:    shlq $27, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    movq %rax, %r12
+; AVX512DQ-NEXT:    shlq $28, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    movq %rax, %r13
+; AVX512DQ-NEXT:    shlq $29, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    movq %rax, %r12
+; AVX512DQ-NEXT:    shlq $30, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    kmovw %k2, %ebp
+; AVX512DQ-NEXT:    kshiftrw $7, %k0, %k2
+; AVX512DQ-NEXT:    shlq $31, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    kmovw %k2, %r12d
+; AVX512DQ-NEXT:    kshiftrw $8, %k0, %k2
+; AVX512DQ-NEXT:    andl $1, %r14d
+; AVX512DQ-NEXT:    shlq $32, %r14
+; AVX512DQ-NEXT:    orq %rax, %r14
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $33, %rax
+; AVX512DQ-NEXT:    orq %r14, %rax
+; AVX512DQ-NEXT:    movq %r12, %r14
+; AVX512DQ-NEXT:    shlq $34, %r14
+; AVX512DQ-NEXT:    orq %rax, %r14
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $35, %rax
+; AVX512DQ-NEXT:    orq %r14, %rax
+; AVX512DQ-NEXT:    movq %r12, %r14
+; AVX512DQ-NEXT:    shlq $36, %r14
+; AVX512DQ-NEXT:    orq %rax, %r14
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $37, %rax
+; AVX512DQ-NEXT:    orq %r14, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $38, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    kmovw %k2, %r14d
+; AVX512DQ-NEXT:    kshiftrw $9, %k0, %k2
+; AVX512DQ-NEXT:    shlq $39, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    andl $1, %r14d
+; AVX512DQ-NEXT:    movq %r14, %rax
+; AVX512DQ-NEXT:    shlq $40, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r14, %r12
+; AVX512DQ-NEXT:    shlq $41, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r14, %rax
+; AVX512DQ-NEXT:    shlq $42, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r14, %r12
+; AVX512DQ-NEXT:    shlq $43, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r14, %rax
+; AVX512DQ-NEXT:    shlq $44, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r14, %r13
+; AVX512DQ-NEXT:    shlq $45, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    kmovw %k2, %r12d
+; AVX512DQ-NEXT:    kshiftrw $10, %k0, %k2
+; AVX512DQ-NEXT:    shlq $46, %r14
+; AVX512DQ-NEXT:    orq %r13, %r14
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $47, %rax
+; AVX512DQ-NEXT:    orq %r14, %rax
+; AVX512DQ-NEXT:    movq %r12, %r14
+; AVX512DQ-NEXT:    shlq $48, %r14
+; AVX512DQ-NEXT:    orq %rax, %r14
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $49, %rax
+; AVX512DQ-NEXT:    orq %r14, %rax
+; AVX512DQ-NEXT:    movq %r12, %r14
+; AVX512DQ-NEXT:    shlq $50, %r14
+; AVX512DQ-NEXT:    orq %rax, %r14
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $51, %rax
+; AVX512DQ-NEXT:    orq %r14, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $52, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    kmovw %k2, %r14d
+; AVX512DQ-NEXT:    kshiftrw $14, %k1, %k1
+; AVX512DQ-NEXT:    shlq $53, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    andl $1, %r14d
+; AVX512DQ-NEXT:    movq %r14, %rax
+; AVX512DQ-NEXT:    shlq $54, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r14, %r12
+; AVX512DQ-NEXT:    shlq $55, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r14, %rax
+; AVX512DQ-NEXT:    shlq $56, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r14, %r12
+; AVX512DQ-NEXT:    shlq $57, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r14, %rax
+; AVX512DQ-NEXT:    shlq $58, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r14, %r12
+; AVX512DQ-NEXT:    shlq $59, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    kmovw %k1, %r13d
+; AVX512DQ-NEXT:    shlq $60, %r14
+; AVX512DQ-NEXT:    orq %r12, %r14
+; AVX512DQ-NEXT:    movq %r9, %r12
+; AVX512DQ-NEXT:    shlq $61, %r12
+; AVX512DQ-NEXT:    orq %r14, %r12
+; AVX512DQ-NEXT:    kmovw %k0, %r14d
+; AVX512DQ-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512DQ-NEXT:    shlq $62, %r9
+; AVX512DQ-NEXT:    orq %r12, %r9
+; AVX512DQ-NEXT:    kmovw %k0, %r12d
+; AVX512DQ-NEXT:    movzbl %r8b, %eax
+; AVX512DQ-NEXT:    shlq $63, %rax
+; AVX512DQ-NEXT:    orq %r9, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movq %rax, 16(%rsi)
+; AVX512DQ-NEXT:    andl $1, %r10d
+; AVX512DQ-NEXT:    leaq (%r10,%r10,2), %rax
+; AVX512DQ-NEXT:    leaq (%rax,%r10,4), %rax
+; AVX512DQ-NEXT:    leaq (%rax,%r10,8), %rax
+; AVX512DQ-NEXT:    movq %r10, %r8
+; AVX512DQ-NEXT:    shlq $4, %r8
+; AVX512DQ-NEXT:    orq %rax, %r8
+; AVX512DQ-NEXT:    shlq $5, %r10
+; AVX512DQ-NEXT:    orq %r8, %r10
+; AVX512DQ-NEXT:    movzbl %bpl, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %r8
+; AVX512DQ-NEXT:    shlq $6, %r8
+; AVX512DQ-NEXT:    movq %rax, %r9
+; AVX512DQ-NEXT:    shlq $7, %r9
+; AVX512DQ-NEXT:    orq %r8, %r9
+; AVX512DQ-NEXT:    movq %rax, %r8
+; AVX512DQ-NEXT:    shlq $8, %r8
+; AVX512DQ-NEXT:    orq %r9, %r8
+; AVX512DQ-NEXT:    movq %rax, %r9
+; AVX512DQ-NEXT:    shlq $9, %r9
+; AVX512DQ-NEXT:    orq %r8, %r9
+; AVX512DQ-NEXT:    movq %rax, %r8
+; AVX512DQ-NEXT:    shlq $10, %r8
+; AVX512DQ-NEXT:    orq %r9, %r8
+; AVX512DQ-NEXT:    movq %rax, %r9
+; AVX512DQ-NEXT:    shlq $11, %r9
+; AVX512DQ-NEXT:    orq %r8, %r9
+; AVX512DQ-NEXT:    shlq $12, %rax
+; AVX512DQ-NEXT:    orq %r9, %rax
+; AVX512DQ-NEXT:    movzbl %dil, %edi
+; AVX512DQ-NEXT:    andl $1, %edi
+; AVX512DQ-NEXT:    movq %rdi, %r8
+; AVX512DQ-NEXT:    shlq $13, %r8
+; AVX512DQ-NEXT:    orq %rax, %r8
+; AVX512DQ-NEXT:    movq %rdi, %rax
+; AVX512DQ-NEXT:    shlq $14, %rax
+; AVX512DQ-NEXT:    orq %r8, %rax
+; AVX512DQ-NEXT:    movq %rdi, %r8
+; AVX512DQ-NEXT:    shlq $15, %r8
+; AVX512DQ-NEXT:    orq %rax, %r8
+; AVX512DQ-NEXT:    movq %rdi, %rax
+; AVX512DQ-NEXT:    shlq $16, %rax
+; AVX512DQ-NEXT:    orq %r8, %rax
+; AVX512DQ-NEXT:    movq %rdi, %r8
+; AVX512DQ-NEXT:    shlq $17, %r8
+; AVX512DQ-NEXT:    orq %rax, %r8
+; AVX512DQ-NEXT:    movq %rdi, %rax
+; AVX512DQ-NEXT:    shlq $18, %rax
+; AVX512DQ-NEXT:    orq %r8, %rax
+; AVX512DQ-NEXT:    shlq $19, %rdi
+; AVX512DQ-NEXT:    orq %rax, %rdi
+; AVX512DQ-NEXT:    movzbl %dl, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %rdx
+; AVX512DQ-NEXT:    shlq $20, %rdx
+; AVX512DQ-NEXT:    orq %rdi, %rdx
+; AVX512DQ-NEXT:    movq %rax, %rdi
+; AVX512DQ-NEXT:    shlq $21, %rdi
+; AVX512DQ-NEXT:    orq %rdx, %rdi
+; AVX512DQ-NEXT:    movq %rax, %rdx
+; AVX512DQ-NEXT:    shlq $22, %rdx
+; AVX512DQ-NEXT:    orq %rdi, %rdx
+; AVX512DQ-NEXT:    movq %rax, %rdi
+; AVX512DQ-NEXT:    shlq $23, %rdi
+; AVX512DQ-NEXT:    orq %rdx, %rdi
+; AVX512DQ-NEXT:    movq %rax, %rdx
+; AVX512DQ-NEXT:    shlq $24, %rdx
+; AVX512DQ-NEXT:    orq %rdi, %rdx
+; AVX512DQ-NEXT:    movq %rax, %rdi
+; AVX512DQ-NEXT:    shlq $25, %rdi
+; AVX512DQ-NEXT:    orq %rdx, %rdi
+; AVX512DQ-NEXT:    shlq $26, %rax
+; AVX512DQ-NEXT:    orq %rdi, %rax
+; AVX512DQ-NEXT:    movzbl %cl, %ecx
+; AVX512DQ-NEXT:    andl $1, %ecx
+; AVX512DQ-NEXT:    movq %rcx, %rdx
+; AVX512DQ-NEXT:    shlq $27, %rdx
+; AVX512DQ-NEXT:    orq %rax, %rdx
+; AVX512DQ-NEXT:    movq %rcx, %rax
+; AVX512DQ-NEXT:    shlq $28, %rax
+; AVX512DQ-NEXT:    orq %rdx, %rax
+; AVX512DQ-NEXT:    movq %rcx, %rdx
+; AVX512DQ-NEXT:    shlq $29, %rdx
+; AVX512DQ-NEXT:    orq %rax, %rdx
+; AVX512DQ-NEXT:    movq %rcx, %rax
+; AVX512DQ-NEXT:    shlq $30, %rax
+; AVX512DQ-NEXT:    orq %rdx, %rax
+; AVX512DQ-NEXT:    movq %rcx, %rdx
+; AVX512DQ-NEXT:    shlq $31, %rdx
+; AVX512DQ-NEXT:    orq %rax, %rdx
+; AVX512DQ-NEXT:    movq %rcx, %rax
+; AVX512DQ-NEXT:    shlq $32, %rax
+; AVX512DQ-NEXT:    orq %rdx, %rax
+; AVX512DQ-NEXT:    shlq $33, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $34, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    movq %r13, %rcx
+; AVX512DQ-NEXT:    shlq $35, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $36, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    movq %r13, %rcx
+; AVX512DQ-NEXT:    shlq $37, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $38, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    movq %r13, %rcx
+; AVX512DQ-NEXT:    shlq $39, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    shlq $40, %r13
+; AVX512DQ-NEXT:    orq %rcx, %r13
+; AVX512DQ-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %rcx
+; AVX512DQ-NEXT:    shlq $41, %rcx
+; AVX512DQ-NEXT:    orq %r13, %rcx
+; AVX512DQ-NEXT:    movq %rax, %rdx
+; AVX512DQ-NEXT:    shlq $42, %rdx
+; AVX512DQ-NEXT:    orq %rcx, %rdx
+; AVX512DQ-NEXT:    movq %rax, %rcx
+; AVX512DQ-NEXT:    shlq $43, %rcx
+; AVX512DQ-NEXT:    orq %rdx, %rcx
+; AVX512DQ-NEXT:    movq %rax, %rdx
+; AVX512DQ-NEXT:    shlq $44, %rdx
+; AVX512DQ-NEXT:    orq %rcx, %rdx
+; AVX512DQ-NEXT:    movq %rax, %rcx
+; AVX512DQ-NEXT:    shlq $45, %rcx
+; AVX512DQ-NEXT:    orq %rdx, %rcx
+; AVX512DQ-NEXT:    movq %rax, %rdx
+; AVX512DQ-NEXT:    shlq $46, %rdx
+; AVX512DQ-NEXT:    orq %rcx, %rdx
+; AVX512DQ-NEXT:    shlq $47, %rax
+; AVX512DQ-NEXT:    orq %rdx, %rax
+; AVX512DQ-NEXT:    andl $1, %r14d
+; AVX512DQ-NEXT:    movq %r14, %rcx
+; AVX512DQ-NEXT:    shlq $48, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    movq %r14, %rax
+; AVX512DQ-NEXT:    shlq $49, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    movq %r14, %rcx
+; AVX512DQ-NEXT:    shlq $50, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    movq %r14, %rax
+; AVX512DQ-NEXT:    shlq $51, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    movq %r14, %rcx
+; AVX512DQ-NEXT:    shlq $52, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    movq %r14, %rax
+; AVX512DQ-NEXT:    shlq $53, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    shlq $54, %r14
+; AVX512DQ-NEXT:    orq %rax, %r14
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $55, %rax
+; AVX512DQ-NEXT:    orq %r14, %rax
+; AVX512DQ-NEXT:    movq %r12, %rcx
+; AVX512DQ-NEXT:    shlq $56, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $57, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    movq %r12, %rcx
+; AVX512DQ-NEXT:    shlq $58, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $59, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    movq %r12, %rcx
+; AVX512DQ-NEXT:    shlq $60, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    shlq $61, %r12
+; AVX512DQ-NEXT:    orq %rcx, %r12
+; AVX512DQ-NEXT:    shlq $62, %rbx
+; AVX512DQ-NEXT:    orq %r12, %rbx
+; AVX512DQ-NEXT:    shlq $63, %r11
+; AVX512DQ-NEXT:    orq %rbx, %r11
+; AVX512DQ-NEXT:    orq %r10, %r11
+; AVX512DQ-NEXT:    movq %r11, 8(%rsi)
+; AVX512DQ-NEXT:    popq %rbx
+; AVX512DQ-NEXT:    popq %r12
+; AVX512DQ-NEXT:    popq %r13
+; AVX512DQ-NEXT:    popq %r14
+; AVX512DQ-NEXT:    popq %r15
+; AVX512DQ-NEXT:    popq %rbp
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: mask_replication_factor7_vf32:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    pushq %rbp
+; AVX512BW-NEXT:    pushq %r15
+; AVX512BW-NEXT:    pushq %r14
+; AVX512BW-NEXT:    pushq %r13
+; AVX512BW-NEXT:    pushq %r12
+; AVX512BW-NEXT:    pushq %rbx
+; AVX512BW-NEXT:    kmovd (%rdi), %k0
+; AVX512BW-NEXT:    kshiftrd $13, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512BW-NEXT:    kshiftrd $12, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %edx
+; AVX512BW-NEXT:    kshiftrd $11, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %edi
+; AVX512BW-NEXT:    kshiftrd $10, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r8d
+; AVX512BW-NEXT:    kshiftrd $27, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r9d
+; AVX512BW-NEXT:    kshiftrd $22, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r11d
+; AVX512BW-NEXT:    kshiftrd $19, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %ebp
+; AVX512BW-NEXT:    kshiftrd $18, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %ecx
+; AVX512BW-NEXT:    kshiftrd $28, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r15d
+; AVX512BW-NEXT:    kshiftrd $29, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %ebx
+; AVX512BW-NEXT:    kshiftrd $30, %k0, %k1
+; AVX512BW-NEXT:    movl %r9d, %r10d
+; AVX512BW-NEXT:    andl $1, %r10d
+; AVX512BW-NEXT:    leal (%r10,%r10,2), %r14d
+; AVX512BW-NEXT:    leal (%r14,%r10,4), %r14d
+; AVX512BW-NEXT:    leal (%r14,%r10,8), %r14d
+; AVX512BW-NEXT:    andl $1, %r15d
+; AVX512BW-NEXT:    movl %r15d, %r12d
+; AVX512BW-NEXT:    shll $4, %r12d
+; AVX512BW-NEXT:    orl %r14d, %r12d
+; AVX512BW-NEXT:    movl %r15d, %r14d
+; AVX512BW-NEXT:    shll $5, %r14d
+; AVX512BW-NEXT:    orl %r12d, %r14d
+; AVX512BW-NEXT:    movl %r15d, %r12d
+; AVX512BW-NEXT:    shll $6, %r12d
+; AVX512BW-NEXT:    movl %r15d, %r13d
+; AVX512BW-NEXT:    shll $7, %r13d
+; AVX512BW-NEXT:    orl %r12d, %r13d
+; AVX512BW-NEXT:    movl %r15d, %r12d
+; AVX512BW-NEXT:    shll $8, %r12d
+; AVX512BW-NEXT:    orl %r13d, %r12d
+; AVX512BW-NEXT:    movl %r15d, %r13d
+; AVX512BW-NEXT:    shll $9, %r13d
+; AVX512BW-NEXT:    orl %r12d, %r13d
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrd $31, %k0, %k1
+; AVX512BW-NEXT:    shll $10, %r15d
+; AVX512BW-NEXT:    orl %r13d, %r15d
+; AVX512BW-NEXT:    andl $1, %ebx
+; AVX512BW-NEXT:    movl %ebx, %r13d
+; AVX512BW-NEXT:    shll $11, %r13d
+; AVX512BW-NEXT:    orl %r15d, %r13d
+; AVX512BW-NEXT:    movl %ebx, %r15d
+; AVX512BW-NEXT:    shll $12, %r15d
+; AVX512BW-NEXT:    orl %r13d, %r15d
+; AVX512BW-NEXT:    movl %ebx, %r13d
+; AVX512BW-NEXT:    shll $13, %r13d
+; AVX512BW-NEXT:    orl %r15d, %r13d
+; AVX512BW-NEXT:    movl %ebx, %r15d
+; AVX512BW-NEXT:    shll $14, %r15d
+; AVX512BW-NEXT:    orl %r13d, %r15d
+; AVX512BW-NEXT:    movl %ebx, %r13d
+; AVX512BW-NEXT:    shll $15, %r13d
+; AVX512BW-NEXT:    orl %r15d, %r13d
+; AVX512BW-NEXT:    movl %ebx, %eax
+; AVX512BW-NEXT:    shll $16, %eax
+; AVX512BW-NEXT:    orl %r13d, %eax
+; AVX512BW-NEXT:    kmovd %k1, %r15d
+; AVX512BW-NEXT:    kshiftrd $4, %k0, %k1
+; AVX512BW-NEXT:    shll $17, %ebx
+; AVX512BW-NEXT:    orl %eax, %ebx
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movl %r12d, %eax
+; AVX512BW-NEXT:    shll $18, %eax
+; AVX512BW-NEXT:    orl %ebx, %eax
+; AVX512BW-NEXT:    movl %r12d, %ebx
+; AVX512BW-NEXT:    shll $19, %ebx
+; AVX512BW-NEXT:    orl %eax, %ebx
+; AVX512BW-NEXT:    movl %r12d, %eax
+; AVX512BW-NEXT:    shll $20, %eax
+; AVX512BW-NEXT:    orl %ebx, %eax
+; AVX512BW-NEXT:    movl %r12d, %ebx
+; AVX512BW-NEXT:    shll $21, %ebx
+; AVX512BW-NEXT:    orl %eax, %ebx
+; AVX512BW-NEXT:    movl %r12d, %eax
+; AVX512BW-NEXT:    shll $22, %eax
+; AVX512BW-NEXT:    orl %ebx, %eax
+; AVX512BW-NEXT:    movl %r12d, %ebx
+; AVX512BW-NEXT:    shll $23, %ebx
+; AVX512BW-NEXT:    orl %eax, %ebx
+; AVX512BW-NEXT:    shll $24, %r12d
+; AVX512BW-NEXT:    orl %ebx, %r12d
+; AVX512BW-NEXT:    movl %r15d, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movl %eax, %ebx
+; AVX512BW-NEXT:    shll $25, %ebx
+; AVX512BW-NEXT:    orl %r12d, %ebx
+; AVX512BW-NEXT:    movl %eax, %r12d
+; AVX512BW-NEXT:    shll $26, %r12d
+; AVX512BW-NEXT:    orl %ebx, %r12d
+; AVX512BW-NEXT:    movl %eax, %ebx
+; AVX512BW-NEXT:    shll $27, %ebx
+; AVX512BW-NEXT:    orl %r12d, %ebx
+; AVX512BW-NEXT:    movl %eax, %r12d
+; AVX512BW-NEXT:    shll $28, %r12d
+; AVX512BW-NEXT:    orl %ebx, %r12d
+; AVX512BW-NEXT:    movl %eax, %r13d
+; AVX512BW-NEXT:    shll $29, %r13d
+; AVX512BW-NEXT:    orl %r12d, %r13d
+; AVX512BW-NEXT:    kmovd %k1, %ebx
+; AVX512BW-NEXT:    kshiftrd $1, %k0, %k1
+; AVX512BW-NEXT:    shll $30, %eax
+; AVX512BW-NEXT:    orl %r13d, %eax
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrd $2, %k0, %k1
+; AVX512BW-NEXT:    shll $31, %r15d
+; AVX512BW-NEXT:    orl %eax, %r15d
+; AVX512BW-NEXT:    orl %r14d, %r15d
+; AVX512BW-NEXT:    movl %r15d, 24(%rsi)
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    movzbl %al, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    leaq (%rax,%rax,2), %r14
+; AVX512BW-NEXT:    leaq (%r14,%rax,4), %r14
+; AVX512BW-NEXT:    leaq (%r14,%rax,8), %r14
+; AVX512BW-NEXT:    movq %rax, %r15
+; AVX512BW-NEXT:    shlq $4, %r15
+; AVX512BW-NEXT:    orq %r14, %r15
+; AVX512BW-NEXT:    movq %rax, %r14
+; AVX512BW-NEXT:    shlq $5, %r14
+; AVX512BW-NEXT:    orq %r15, %r14
+; AVX512BW-NEXT:    shlq $6, %rax
+; AVX512BW-NEXT:    movzbl %r12b, %r15d
+; AVX512BW-NEXT:    andl $1, %r15d
+; AVX512BW-NEXT:    movq %r15, %r12
+; AVX512BW-NEXT:    shlq $7, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $8, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r15, %r12
+; AVX512BW-NEXT:    shlq $9, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $10, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r15, %r12
+; AVX512BW-NEXT:    shlq $11, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $12, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrd $3, %k0, %k1
+; AVX512BW-NEXT:    shlq $13, %r15
+; AVX512BW-NEXT:    orq %rax, %r15
+; AVX512BW-NEXT:    movzbl %r12b, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $14, %rax
+; AVX512BW-NEXT:    orq %r15, %rax
+; AVX512BW-NEXT:    movq %r12, %r15
+; AVX512BW-NEXT:    shlq $15, %r15
+; AVX512BW-NEXT:    orq %rax, %r15
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $16, %rax
+; AVX512BW-NEXT:    orq %r15, %rax
+; AVX512BW-NEXT:    movq %r12, %r15
+; AVX512BW-NEXT:    shlq $17, %r15
+; AVX512BW-NEXT:    orq %rax, %r15
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $18, %rax
+; AVX512BW-NEXT:    orq %r15, %rax
+; AVX512BW-NEXT:    movq %r12, %r15
+; AVX512BW-NEXT:    shlq $19, %r15
+; AVX512BW-NEXT:    orq %rax, %r15
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    kshiftrd $9, %k0, %k1
+; AVX512BW-NEXT:    shlq $20, %r12
+; AVX512BW-NEXT:    orq %r15, %r12
+; AVX512BW-NEXT:    movzbl %al, %r15d
+; AVX512BW-NEXT:    andl $1, %r15d
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $21, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r15, %r12
+; AVX512BW-NEXT:    shlq $22, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $23, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r15, %r12
+; AVX512BW-NEXT:    shlq $24, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $25, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r15, %r12
+; AVX512BW-NEXT:    shlq $26, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    shlq $27, %r15
+; AVX512BW-NEXT:    orq %r12, %r15
+; AVX512BW-NEXT:    movzbl %bl, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %r12
+; AVX512BW-NEXT:    shlq $28, %r12
+; AVX512BW-NEXT:    orq %r15, %r12
+; AVX512BW-NEXT:    movq %rax, %r15
+; AVX512BW-NEXT:    shlq $29, %r15
+; AVX512BW-NEXT:    orq %r12, %r15
+; AVX512BW-NEXT:    movq %rax, %r12
+; AVX512BW-NEXT:    shlq $30, %r12
+; AVX512BW-NEXT:    orq %r15, %r12
+; AVX512BW-NEXT:    kmovd %k1, %r15d
+; AVX512BW-NEXT:    kshiftrd $5, %k0, %k1
+; AVX512BW-NEXT:    shlq $31, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    andl $1, %ebx
+; AVX512BW-NEXT:    movq %rbx, %r12
+; AVX512BW-NEXT:    shlq $32, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %rbx, %rax
+; AVX512BW-NEXT:    shlq $33, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrd $6, %k0, %k1
+; AVX512BW-NEXT:    shlq $34, %rbx
+; AVX512BW-NEXT:    orq %rax, %rbx
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $35, %rax
+; AVX512BW-NEXT:    orq %rbx, %rax
+; AVX512BW-NEXT:    movq %r12, %rbx
+; AVX512BW-NEXT:    shlq $36, %rbx
+; AVX512BW-NEXT:    orq %rax, %rbx
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $37, %rax
+; AVX512BW-NEXT:    orq %rbx, %rax
+; AVX512BW-NEXT:    movq %r12, %rbx
+; AVX512BW-NEXT:    shlq $38, %rbx
+; AVX512BW-NEXT:    orq %rax, %rbx
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $39, %rax
+; AVX512BW-NEXT:    orq %rbx, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $40, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    kmovd %k1, %ebx
+; AVX512BW-NEXT:    kshiftrd $7, %k0, %k1
+; AVX512BW-NEXT:    shlq $41, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    andl $1, %ebx
+; AVX512BW-NEXT:    movq %rbx, %rax
+; AVX512BW-NEXT:    shlq $42, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %rbx, %r12
+; AVX512BW-NEXT:    shlq $43, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %rbx, %rax
+; AVX512BW-NEXT:    shlq $44, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %rbx, %r12
+; AVX512BW-NEXT:    shlq $45, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %rbx, %rax
+; AVX512BW-NEXT:    shlq $46, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %rbx, %r13
+; AVX512BW-NEXT:    shlq $47, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrd $8, %k0, %k1
+; AVX512BW-NEXT:    shlq $48, %rbx
+; AVX512BW-NEXT:    orq %r13, %rbx
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $49, %rax
+; AVX512BW-NEXT:    orq %rbx, %rax
+; AVX512BW-NEXT:    movq %r12, %rbx
+; AVX512BW-NEXT:    shlq $50, %rbx
+; AVX512BW-NEXT:    orq %rax, %rbx
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $51, %rax
+; AVX512BW-NEXT:    orq %rbx, %rax
+; AVX512BW-NEXT:    movq %r12, %rbx
+; AVX512BW-NEXT:    shlq $52, %rbx
+; AVX512BW-NEXT:    orq %rax, %rbx
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $53, %rax
+; AVX512BW-NEXT:    orq %rbx, %rax
+; AVX512BW-NEXT:    movq %r12, %rbx
+; AVX512BW-NEXT:    shlq $54, %rbx
+; AVX512BW-NEXT:    orq %rax, %rbx
+; AVX512BW-NEXT:    kmovd %k1, %r13d
+; AVX512BW-NEXT:    kshiftrd $20, %k0, %k1
+; AVX512BW-NEXT:    shlq $55, %r12
+; AVX512BW-NEXT:    orq %rbx, %r12
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $56, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %rbx
+; AVX512BW-NEXT:    shlq $57, %rbx
+; AVX512BW-NEXT:    orq %rax, %rbx
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $58, %rax
+; AVX512BW-NEXT:    orq %rbx, %rax
+; AVX512BW-NEXT:    movq %r13, %rbx
+; AVX512BW-NEXT:    shlq $59, %rbx
+; AVX512BW-NEXT:    orq %rax, %rbx
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $60, %rax
+; AVX512BW-NEXT:    orq %rbx, %rax
+; AVX512BW-NEXT:    movq %r13, %rbx
+; AVX512BW-NEXT:    shlq $61, %rbx
+; AVX512BW-NEXT:    orq %rax, %rbx
+; AVX512BW-NEXT:    shlq $62, %r13
+; AVX512BW-NEXT:    orq %rbx, %r13
+; AVX512BW-NEXT:    movzbl %r15b, %ebx
+; AVX512BW-NEXT:    movq %rbx, %rax
+; AVX512BW-NEXT:    shlq $63, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    orq %r14, %rax
+; AVX512BW-NEXT:    movq %rax, (%rsi)
+; AVX512BW-NEXT:    movzbl %cl, %r14d
+; AVX512BW-NEXT:    movl %r14d, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    leaq (%rax,%rax,2), %r15
+; AVX512BW-NEXT:    leaq (%r15,%rax,4), %r15
+; AVX512BW-NEXT:    leaq (%r15,%rax,8), %r15
+; AVX512BW-NEXT:    shlq $4, %rax
+; AVX512BW-NEXT:    orq %r15, %rax
+; AVX512BW-NEXT:    movzbl %bpl, %r13d
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movq %r13, %r15
+; AVX512BW-NEXT:    shlq $5, %r15
+; AVX512BW-NEXT:    orq %rax, %r15
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $6, %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $7, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $8, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $9, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $10, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    kmovd %k1, %ebp
+; AVX512BW-NEXT:    kshiftrd $21, %k0, %k1
+; AVX512BW-NEXT:    shlq $11, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movzbl %bpl, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $12, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $13, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $14, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $15, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $16, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $17, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    kshiftrd $23, %k0, %k1
+; AVX512BW-NEXT:    shlq $18, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    movzbl %al, %r13d
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $19, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $20, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $21, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $22, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $23, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $24, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    shlq $25, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    movzbl %r11b, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $26, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $27, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $28, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $29, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $30, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    kmovd %k1, %r13d
+; AVX512BW-NEXT:    kshiftrd $24, %k0, %k1
+; AVX512BW-NEXT:    shlq $31, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    kmovd %k1, %ebp
+; AVX512BW-NEXT:    kshiftrd $25, %k0, %k1
+; AVX512BW-NEXT:    andl $1, %r11d
+; AVX512BW-NEXT:    shlq $32, %r11
+; AVX512BW-NEXT:    orq %r12, %r11
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $33, %rax
+; AVX512BW-NEXT:    orq %r11, %rax
+; AVX512BW-NEXT:    movq %r13, %r11
+; AVX512BW-NEXT:    shlq $34, %r11
+; AVX512BW-NEXT:    orq %rax, %r11
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $35, %rax
+; AVX512BW-NEXT:    orq %r11, %rax
+; AVX512BW-NEXT:    movq %r13, %r11
+; AVX512BW-NEXT:    shlq $36, %r11
+; AVX512BW-NEXT:    orq %rax, %r11
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $37, %rax
+; AVX512BW-NEXT:    orq %r11, %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $38, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    kmovd %k1, %r11d
+; AVX512BW-NEXT:    kshiftrd $26, %k0, %k1
+; AVX512BW-NEXT:    shlq $39, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    andl $1, %ebp
+; AVX512BW-NEXT:    movq %rbp, %rax
+; AVX512BW-NEXT:    shlq $40, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %rbp, %r12
+; AVX512BW-NEXT:    shlq $41, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %rbp, %rax
+; AVX512BW-NEXT:    shlq $42, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %rbp, %r12
+; AVX512BW-NEXT:    shlq $43, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %rbp, %rax
+; AVX512BW-NEXT:    shlq $44, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %rbp, %r13
+; AVX512BW-NEXT:    shlq $45, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrd $14, %k0, %k1
+; AVX512BW-NEXT:    shlq $46, %rbp
+; AVX512BW-NEXT:    orq %r13, %rbp
+; AVX512BW-NEXT:    andl $1, %r11d
+; AVX512BW-NEXT:    movq %r11, %rax
+; AVX512BW-NEXT:    shlq $47, %rax
+; AVX512BW-NEXT:    orq %rbp, %rax
+; AVX512BW-NEXT:    movq %r11, %r13
+; AVX512BW-NEXT:    shlq $48, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r11, %rax
+; AVX512BW-NEXT:    shlq $49, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r11, %r13
+; AVX512BW-NEXT:    shlq $50, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r11, %rax
+; AVX512BW-NEXT:    shlq $51, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r11, %r13
+; AVX512BW-NEXT:    shlq $52, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    kmovd %k1, %ebp
+; AVX512BW-NEXT:    kshiftrd $15, %k0, %k1
+; AVX512BW-NEXT:    shlq $53, %r11
+; AVX512BW-NEXT:    orq %r13, %r11
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $54, %rax
+; AVX512BW-NEXT:    orq %r11, %rax
+; AVX512BW-NEXT:    movq %r12, %r11
+; AVX512BW-NEXT:    shlq $55, %r11
+; AVX512BW-NEXT:    orq %rax, %r11
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $56, %rax
+; AVX512BW-NEXT:    orq %r11, %rax
+; AVX512BW-NEXT:    movq %r12, %r11
+; AVX512BW-NEXT:    shlq $57, %r11
+; AVX512BW-NEXT:    orq %rax, %r11
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $58, %rax
+; AVX512BW-NEXT:    orq %r11, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $59, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    kmovd %k1, %r11d
+; AVX512BW-NEXT:    kshiftrd $16, %k0, %k1
+; AVX512BW-NEXT:    shlq $60, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    movq %r10, %r13
+; AVX512BW-NEXT:    shlq $61, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrd $17, %k0, %k0
+; AVX512BW-NEXT:    shlq $62, %r10
+; AVX512BW-NEXT:    orq %r13, %r10
+; AVX512BW-NEXT:    kmovd %k0, %r13d
+; AVX512BW-NEXT:    movzbl %r9b, %eax
+; AVX512BW-NEXT:    shlq $63, %rax
+; AVX512BW-NEXT:    orq %r10, %rax
+; AVX512BW-NEXT:    orq %r15, %rax
+; AVX512BW-NEXT:    movq %rax, 16(%rsi)
+; AVX512BW-NEXT:    andl $1, %ebx
+; AVX512BW-NEXT:    leaq (%rbx,%rbx,2), %rax
+; AVX512BW-NEXT:    leaq (%rax,%rbx,4), %rax
+; AVX512BW-NEXT:    leaq (%rax,%rbx,8), %rax
+; AVX512BW-NEXT:    movq %rbx, %r9
+; AVX512BW-NEXT:    shlq $4, %r9
+; AVX512BW-NEXT:    orq %rax, %r9
+; AVX512BW-NEXT:    shlq $5, %rbx
+; AVX512BW-NEXT:    orq %r9, %rbx
+; AVX512BW-NEXT:    movzbl %r8b, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %r8
+; AVX512BW-NEXT:    shlq $6, %r8
+; AVX512BW-NEXT:    movq %rax, %r9
+; AVX512BW-NEXT:    shlq $7, %r9
+; AVX512BW-NEXT:    orq %r8, %r9
+; AVX512BW-NEXT:    movq %rax, %r8
+; AVX512BW-NEXT:    shlq $8, %r8
+; AVX512BW-NEXT:    orq %r9, %r8
+; AVX512BW-NEXT:    movq %rax, %r9
+; AVX512BW-NEXT:    shlq $9, %r9
+; AVX512BW-NEXT:    orq %r8, %r9
+; AVX512BW-NEXT:    movq %rax, %r8
+; AVX512BW-NEXT:    shlq $10, %r8
+; AVX512BW-NEXT:    orq %r9, %r8
+; AVX512BW-NEXT:    movq %rax, %r9
+; AVX512BW-NEXT:    shlq $11, %r9
+; AVX512BW-NEXT:    orq %r8, %r9
+; AVX512BW-NEXT:    shlq $12, %rax
+; AVX512BW-NEXT:    orq %r9, %rax
+; AVX512BW-NEXT:    movzbl %dil, %edi
+; AVX512BW-NEXT:    andl $1, %edi
+; AVX512BW-NEXT:    movq %rdi, %r8
+; AVX512BW-NEXT:    shlq $13, %r8
+; AVX512BW-NEXT:    orq %rax, %r8
+; AVX512BW-NEXT:    movq %rdi, %rax
+; AVX512BW-NEXT:    shlq $14, %rax
+; AVX512BW-NEXT:    orq %r8, %rax
+; AVX512BW-NEXT:    movq %rdi, %r8
+; AVX512BW-NEXT:    shlq $15, %r8
+; AVX512BW-NEXT:    orq %rax, %r8
+; AVX512BW-NEXT:    movq %rdi, %rax
+; AVX512BW-NEXT:    shlq $16, %rax
+; AVX512BW-NEXT:    orq %r8, %rax
+; AVX512BW-NEXT:    movq %rdi, %r8
+; AVX512BW-NEXT:    shlq $17, %r8
+; AVX512BW-NEXT:    orq %rax, %r8
+; AVX512BW-NEXT:    movq %rdi, %rax
+; AVX512BW-NEXT:    shlq $18, %rax
+; AVX512BW-NEXT:    orq %r8, %rax
+; AVX512BW-NEXT:    shlq $19, %rdi
+; AVX512BW-NEXT:    orq %rax, %rdi
+; AVX512BW-NEXT:    movzbl %dl, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %rdx
+; AVX512BW-NEXT:    shlq $20, %rdx
+; AVX512BW-NEXT:    orq %rdi, %rdx
+; AVX512BW-NEXT:    movq %rax, %rdi
+; AVX512BW-NEXT:    shlq $21, %rdi
+; AVX512BW-NEXT:    orq %rdx, %rdi
+; AVX512BW-NEXT:    movq %rax, %rdx
+; AVX512BW-NEXT:    shlq $22, %rdx
+; AVX512BW-NEXT:    orq %rdi, %rdx
+; AVX512BW-NEXT:    movq %rax, %rdi
+; AVX512BW-NEXT:    shlq $23, %rdi
+; AVX512BW-NEXT:    orq %rdx, %rdi
+; AVX512BW-NEXT:    movq %rax, %rdx
+; AVX512BW-NEXT:    shlq $24, %rdx
+; AVX512BW-NEXT:    orq %rdi, %rdx
+; AVX512BW-NEXT:    movq %rax, %rdi
+; AVX512BW-NEXT:    shlq $25, %rdi
+; AVX512BW-NEXT:    orq %rdx, %rdi
+; AVX512BW-NEXT:    shlq $26, %rax
+; AVX512BW-NEXT:    orq %rdi, %rax
+; AVX512BW-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
+; AVX512BW-NEXT:    andl $1, %edx
+; AVX512BW-NEXT:    movq %rdx, %rdi
+; AVX512BW-NEXT:    shlq $27, %rdi
+; AVX512BW-NEXT:    orq %rax, %rdi
+; AVX512BW-NEXT:    movq %rdx, %rax
+; AVX512BW-NEXT:    shlq $28, %rax
+; AVX512BW-NEXT:    orq %rdi, %rax
+; AVX512BW-NEXT:    movq %rdx, %rdi
+; AVX512BW-NEXT:    shlq $29, %rdi
+; AVX512BW-NEXT:    orq %rax, %rdi
+; AVX512BW-NEXT:    movq %rdx, %rax
+; AVX512BW-NEXT:    shlq $30, %rax
+; AVX512BW-NEXT:    orq %rdi, %rax
+; AVX512BW-NEXT:    movq %rdx, %rdi
+; AVX512BW-NEXT:    shlq $31, %rdi
+; AVX512BW-NEXT:    orq %rax, %rdi
+; AVX512BW-NEXT:    movq %rdx, %rax
+; AVX512BW-NEXT:    shlq $32, %rax
+; AVX512BW-NEXT:    orq %rdi, %rax
+; AVX512BW-NEXT:    shlq $33, %rdx
+; AVX512BW-NEXT:    orq %rax, %rdx
+; AVX512BW-NEXT:    andl $1, %ebp
+; AVX512BW-NEXT:    movq %rbp, %rax
+; AVX512BW-NEXT:    shlq $34, %rax
+; AVX512BW-NEXT:    orq %rdx, %rax
+; AVX512BW-NEXT:    movq %rbp, %rdx
+; AVX512BW-NEXT:    shlq $35, %rdx
+; AVX512BW-NEXT:    orq %rax, %rdx
+; AVX512BW-NEXT:    movq %rbp, %rax
+; AVX512BW-NEXT:    shlq $36, %rax
+; AVX512BW-NEXT:    orq %rdx, %rax
+; AVX512BW-NEXT:    movq %rbp, %rdx
+; AVX512BW-NEXT:    shlq $37, %rdx
+; AVX512BW-NEXT:    orq %rax, %rdx
+; AVX512BW-NEXT:    movq %rbp, %rax
+; AVX512BW-NEXT:    shlq $38, %rax
+; AVX512BW-NEXT:    orq %rdx, %rax
+; AVX512BW-NEXT:    movq %rbp, %rdx
+; AVX512BW-NEXT:    shlq $39, %rdx
+; AVX512BW-NEXT:    orq %rax, %rdx
+; AVX512BW-NEXT:    shlq $40, %rbp
+; AVX512BW-NEXT:    orq %rdx, %rbp
+; AVX512BW-NEXT:    andl $1, %r11d
+; AVX512BW-NEXT:    movq %r11, %rax
+; AVX512BW-NEXT:    shlq $41, %rax
+; AVX512BW-NEXT:    orq %rbp, %rax
+; AVX512BW-NEXT:    movq %r11, %rdx
+; AVX512BW-NEXT:    shlq $42, %rdx
+; AVX512BW-NEXT:    orq %rax, %rdx
+; AVX512BW-NEXT:    movq %r11, %rax
+; AVX512BW-NEXT:    shlq $43, %rax
+; AVX512BW-NEXT:    orq %rdx, %rax
+; AVX512BW-NEXT:    movq %r11, %rdx
+; AVX512BW-NEXT:    shlq $44, %rdx
+; AVX512BW-NEXT:    orq %rax, %rdx
+; AVX512BW-NEXT:    movq %r11, %rax
+; AVX512BW-NEXT:    shlq $45, %rax
+; AVX512BW-NEXT:    orq %rdx, %rax
+; AVX512BW-NEXT:    movq %r11, %rdx
+; AVX512BW-NEXT:    shlq $46, %rdx
+; AVX512BW-NEXT:    orq %rax, %rdx
+; AVX512BW-NEXT:    shlq $47, %r11
+; AVX512BW-NEXT:    orq %rdx, %r11
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $48, %rax
+; AVX512BW-NEXT:    orq %r11, %rax
+; AVX512BW-NEXT:    movq %r12, %rdx
+; AVX512BW-NEXT:    shlq $49, %rdx
+; AVX512BW-NEXT:    orq %rax, %rdx
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $50, %rax
+; AVX512BW-NEXT:    orq %rdx, %rax
+; AVX512BW-NEXT:    movq %r12, %rdx
+; AVX512BW-NEXT:    shlq $51, %rdx
+; AVX512BW-NEXT:    orq %rax, %rdx
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $52, %rax
+; AVX512BW-NEXT:    orq %rdx, %rax
+; AVX512BW-NEXT:    movq %r12, %rdx
+; AVX512BW-NEXT:    shlq $53, %rdx
+; AVX512BW-NEXT:    orq %rax, %rdx
+; AVX512BW-NEXT:    shlq $54, %r12
+; AVX512BW-NEXT:    orq %rdx, %r12
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $55, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %rdx
+; AVX512BW-NEXT:    shlq $56, %rdx
+; AVX512BW-NEXT:    orq %rax, %rdx
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $57, %rax
+; AVX512BW-NEXT:    orq %rdx, %rax
+; AVX512BW-NEXT:    movq %r13, %rdx
+; AVX512BW-NEXT:    shlq $58, %rdx
+; AVX512BW-NEXT:    orq %rax, %rdx
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $59, %rax
+; AVX512BW-NEXT:    orq %rdx, %rax
+; AVX512BW-NEXT:    movq %r13, %rdx
+; AVX512BW-NEXT:    shlq $60, %rdx
+; AVX512BW-NEXT:    orq %rax, %rdx
+; AVX512BW-NEXT:    shlq $61, %r13
+; AVX512BW-NEXT:    orq %rdx, %r13
+; AVX512BW-NEXT:    andl $1, %ecx
+; AVX512BW-NEXT:    shlq $62, %rcx
+; AVX512BW-NEXT:    orq %r13, %rcx
+; AVX512BW-NEXT:    shlq $63, %r14
+; AVX512BW-NEXT:    orq %rcx, %r14
+; AVX512BW-NEXT:    orq %rbx, %r14
+; AVX512BW-NEXT:    movq %r14, 8(%rsi)
+; AVX512BW-NEXT:    popq %rbx
+; AVX512BW-NEXT:    popq %r12
+; AVX512BW-NEXT:    popq %r13
+; AVX512BW-NEXT:    popq %r14
+; AVX512BW-NEXT:    popq %r15
+; AVX512BW-NEXT:    popq %rbp
+; AVX512BW-NEXT:    retq
+  %src.vec = load <32 x i1>, ptr %in.vec, align 64
+  %tgt.mask = shufflevector <32 x i1> %src.vec, <32 x i1> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+  store <224 x i1> %tgt.mask, ptr %out.vec, align 64
+  ret void
+}
+
+define void @mask_replication_factor7_vf64(ptr %in.vec, ptr %out.vec) nounwind {
+; AVX512F-ONLY-LABEL: mask_replication_factor7_vf64:
+; AVX512F-ONLY:       # %bb.0:
+; AVX512F-ONLY-NEXT:    pushq %rbp
+; AVX512F-ONLY-NEXT:    pushq %r15
+; AVX512F-ONLY-NEXT:    pushq %r14
+; AVX512F-ONLY-NEXT:    pushq %r13
+; AVX512F-ONLY-NEXT:    pushq %r12
+; AVX512F-ONLY-NEXT:    pushq %rbx
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
+; AVX512F-ONLY-NEXT:    kmovw 2(%rdi), %k0
+; AVX512F-ONLY-NEXT:    kmovw 4(%rdi), %k2
+; AVX512F-ONLY-NEXT:    kmovw 6(%rdi), %k3
+; AVX512F-ONLY-NEXT:    kshiftrw $15, %k1, %k4
+; AVX512F-ONLY-NEXT:    kmovw %k4, %eax
+; AVX512F-ONLY-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512F-ONLY-NEXT:    kshiftrw $13, %k1, %k4
+; AVX512F-ONLY-NEXT:    kmovw %k4, %ecx
+; AVX512F-ONLY-NEXT:    kshiftrw $12, %k1, %k4
+; AVX512F-ONLY-NEXT:    kmovw %k4, %edx
+; AVX512F-ONLY-NEXT:    kshiftrw $11, %k1, %k4
+; AVX512F-ONLY-NEXT:    kmovw %k4, %edi
+; AVX512F-ONLY-NEXT:    kshiftrw $13, %k2, %k4
+; AVX512F-ONLY-NEXT:    kmovw %k4, %ebx
+; AVX512F-ONLY-NEXT:    kshiftrw $5, %k2, %k4
+; AVX512F-ONLY-NEXT:    kmovw %k4, %ebp
+; AVX512F-ONLY-NEXT:    kshiftrw $11, %k3, %k4
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r9d
+; AVX512F-ONLY-NEXT:    kshiftrw $8, %k3, %k4
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r10d
+; AVX512F-ONLY-NEXT:    kshiftrw $6, %k3, %k4
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r11d
+; AVX512F-ONLY-NEXT:    kshiftrw $4, %k1, %k4
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r8d
+; AVX512F-ONLY-NEXT:    kshiftrw $1, %k1, %k4
+; AVX512F-ONLY-NEXT:    kmovw %k4, %eax
+; AVX512F-ONLY-NEXT:    kshiftrw $2, %k1, %k4
+; AVX512F-ONLY-NEXT:    kmovw %k1, %r14d
+; AVX512F-ONLY-NEXT:    movzbl %r14b, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    leaq (%r12,%r12,2), %r14
+; AVX512F-ONLY-NEXT:    leaq (%r14,%r12,4), %r14
+; AVX512F-ONLY-NEXT:    leaq (%r14,%r12,8), %r14
+; AVX512F-ONLY-NEXT:    movq %r12, %r15
+; AVX512F-ONLY-NEXT:    shlq $4, %r15
+; AVX512F-ONLY-NEXT:    orq %r14, %r15
+; AVX512F-ONLY-NEXT:    movq %r12, %r14
+; AVX512F-ONLY-NEXT:    shlq $5, %r14
+; AVX512F-ONLY-NEXT:    orq %r15, %r14
+; AVX512F-ONLY-NEXT:    shlq $6, %r12
+; AVX512F-ONLY-NEXT:    movzbl %al, %r15d
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $7, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $8, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $9, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $10, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $11, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $12, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k4, %eax
+; AVX512F-ONLY-NEXT:    kshiftrw $3, %k1, %k4
+; AVX512F-ONLY-NEXT:    shlq $13, %r15
+; AVX512F-ONLY-NEXT:    orq %r12, %r15
+; AVX512F-ONLY-NEXT:    movzbl %al, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $14, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r15
+; AVX512F-ONLY-NEXT:    shlq $15, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $16, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r15
+; AVX512F-ONLY-NEXT:    shlq $17, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $18, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r15
+; AVX512F-ONLY-NEXT:    shlq $19, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    kmovw %k4, %eax
+; AVX512F-ONLY-NEXT:    kshiftrw $7, %k3, %k4
+; AVX512F-ONLY-NEXT:    shlq $20, %r12
+; AVX512F-ONLY-NEXT:    orq %r15, %r12
+; AVX512F-ONLY-NEXT:    movzbl %al, %r15d
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $21, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $22, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $23, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $24, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $25, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $26, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $27, %r15
+; AVX512F-ONLY-NEXT:    orq %r12, %r15
+; AVX512F-ONLY-NEXT:    movzbl %r8b, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $28, %r12
+; AVX512F-ONLY-NEXT:    orq %r15, %r12
+; AVX512F-ONLY-NEXT:    movq %rax, %r15
+; AVX512F-ONLY-NEXT:    shlq $29, %r15
+; AVX512F-ONLY-NEXT:    orq %r12, %r15
+; AVX512F-ONLY-NEXT:    movq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $30, %r12
+; AVX512F-ONLY-NEXT:    orq %r15, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r15d
+; AVX512F-ONLY-NEXT:    kshiftrw $5, %k1, %k4
+; AVX512F-ONLY-NEXT:    shlq $31, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    andl $1, %r8d
+; AVX512F-ONLY-NEXT:    movq %r8, %r12
+; AVX512F-ONLY-NEXT:    shlq $32, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r8, %rax
+; AVX512F-ONLY-NEXT:    shlq $33, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $6, %k1, %k4
+; AVX512F-ONLY-NEXT:    shlq $34, %r8
+; AVX512F-ONLY-NEXT:    orq %rax, %r8
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $35, %rax
+; AVX512F-ONLY-NEXT:    orq %r8, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r8
+; AVX512F-ONLY-NEXT:    shlq $36, %r8
+; AVX512F-ONLY-NEXT:    orq %rax, %r8
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $37, %rax
+; AVX512F-ONLY-NEXT:    orq %r8, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r8
+; AVX512F-ONLY-NEXT:    shlq $38, %r8
+; AVX512F-ONLY-NEXT:    orq %rax, %r8
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $39, %rax
+; AVX512F-ONLY-NEXT:    orq %r8, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $40, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r8d
+; AVX512F-ONLY-NEXT:    kshiftrw $7, %k1, %k4
+; AVX512F-ONLY-NEXT:    shlq $41, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    andl $1, %r8d
+; AVX512F-ONLY-NEXT:    movq %r8, %rax
+; AVX512F-ONLY-NEXT:    shlq $42, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r8, %r12
+; AVX512F-ONLY-NEXT:    shlq $43, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r8, %rax
+; AVX512F-ONLY-NEXT:    shlq $44, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r8, %r12
+; AVX512F-ONLY-NEXT:    shlq $45, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r8, %rax
+; AVX512F-ONLY-NEXT:    shlq $46, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r8, %r13
+; AVX512F-ONLY-NEXT:    shlq $47, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $8, %k1, %k4
+; AVX512F-ONLY-NEXT:    shlq $48, %r8
+; AVX512F-ONLY-NEXT:    orq %r13, %r8
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $49, %rax
+; AVX512F-ONLY-NEXT:    orq %r8, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r8
+; AVX512F-ONLY-NEXT:    shlq $50, %r8
+; AVX512F-ONLY-NEXT:    orq %rax, %r8
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $51, %rax
+; AVX512F-ONLY-NEXT:    orq %r8, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r8
+; AVX512F-ONLY-NEXT:    shlq $52, %r8
+; AVX512F-ONLY-NEXT:    orq %rax, %r8
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $53, %rax
+; AVX512F-ONLY-NEXT:    orq %r8, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r8
+; AVX512F-ONLY-NEXT:    shlq $54, %r8
+; AVX512F-ONLY-NEXT:    orq %rax, %r8
+; AVX512F-ONLY-NEXT:    kmovw %k4, %eax
+; AVX512F-ONLY-NEXT:    kshiftrw $9, %k1, %k4
+; AVX512F-ONLY-NEXT:    shlq $55, %r12
+; AVX512F-ONLY-NEXT:    orq %r8, %r12
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %r8
+; AVX512F-ONLY-NEXT:    shlq $56, %r8
+; AVX512F-ONLY-NEXT:    orq %r12, %r8
+; AVX512F-ONLY-NEXT:    movq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $57, %r12
+; AVX512F-ONLY-NEXT:    orq %r8, %r12
+; AVX512F-ONLY-NEXT:    movq %rax, %r8
+; AVX512F-ONLY-NEXT:    shlq $58, %r8
+; AVX512F-ONLY-NEXT:    orq %r12, %r8
+; AVX512F-ONLY-NEXT:    movq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $59, %r12
+; AVX512F-ONLY-NEXT:    orq %r8, %r12
+; AVX512F-ONLY-NEXT:    movq %rax, %r8
+; AVX512F-ONLY-NEXT:    shlq $60, %r8
+; AVX512F-ONLY-NEXT:    orq %r12, %r8
+; AVX512F-ONLY-NEXT:    movq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $61, %r12
+; AVX512F-ONLY-NEXT:    orq %r8, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r8d
+; AVX512F-ONLY-NEXT:    kshiftrw $9, %k3, %k4
+; AVX512F-ONLY-NEXT:    shlq $62, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movzbl %r8b, %r8d
+; AVX512F-ONLY-NEXT:    movq %r8, %r12
+; AVX512F-ONLY-NEXT:    shlq $63, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    orq %r14, %r12
+; AVX512F-ONLY-NEXT:    movq %r12, (%rsi)
+; AVX512F-ONLY-NEXT:    movzbl %r15b, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movzbl %r11b, %r14d
+; AVX512F-ONLY-NEXT:    movl %r14d, %r15d
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    leaq (%r15,%rax,2), %r15
+; AVX512F-ONLY-NEXT:    leaq (%r15,%rax,4), %r15
+; AVX512F-ONLY-NEXT:    leaq (%r15,%rax,8), %r15
+; AVX512F-ONLY-NEXT:    movq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $4, %r12
+; AVX512F-ONLY-NEXT:    orq %r15, %r12
+; AVX512F-ONLY-NEXT:    movq %rax, %r15
+; AVX512F-ONLY-NEXT:    shlq $5, %r15
+; AVX512F-ONLY-NEXT:    orq %r12, %r15
+; AVX512F-ONLY-NEXT:    movq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $6, %r12
+; AVX512F-ONLY-NEXT:    shlq $7, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movzbl %r10b, %r10d
+; AVX512F-ONLY-NEXT:    andl $1, %r10d
+; AVX512F-ONLY-NEXT:    movq %r10, %r12
+; AVX512F-ONLY-NEXT:    shlq $8, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r10, %rax
+; AVX512F-ONLY-NEXT:    shlq $9, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r10, %r12
+; AVX512F-ONLY-NEXT:    shlq $10, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r10, %rax
+; AVX512F-ONLY-NEXT:    shlq $11, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r10, %r12
+; AVX512F-ONLY-NEXT:    shlq $12, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r10, %rax
+; AVX512F-ONLY-NEXT:    shlq $13, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $10, %k3, %k4
+; AVX512F-ONLY-NEXT:    shlq $14, %r10
+; AVX512F-ONLY-NEXT:    orq %rax, %r10
+; AVX512F-ONLY-NEXT:    movzbl %r12b, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $15, %rax
+; AVX512F-ONLY-NEXT:    orq %r10, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r10
+; AVX512F-ONLY-NEXT:    shlq $16, %r10
+; AVX512F-ONLY-NEXT:    orq %rax, %r10
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $17, %rax
+; AVX512F-ONLY-NEXT:    orq %r10, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r10
+; AVX512F-ONLY-NEXT:    shlq $18, %r10
+; AVX512F-ONLY-NEXT:    orq %rax, %r10
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $19, %rax
+; AVX512F-ONLY-NEXT:    orq %r10, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r10
+; AVX512F-ONLY-NEXT:    shlq $20, %r10
+; AVX512F-ONLY-NEXT:    orq %rax, %r10
+; AVX512F-ONLY-NEXT:    kmovw %k4, %eax
+; AVX512F-ONLY-NEXT:    kshiftrw $15, %k3, %k4
+; AVX512F-ONLY-NEXT:    shlq $21, %r12
+; AVX512F-ONLY-NEXT:    orq %r10, %r12
+; AVX512F-ONLY-NEXT:    movzbl %al, %r10d
+; AVX512F-ONLY-NEXT:    andl $1, %r10d
+; AVX512F-ONLY-NEXT:    movq %r10, %rax
+; AVX512F-ONLY-NEXT:    shlq $22, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r10, %r12
+; AVX512F-ONLY-NEXT:    shlq $23, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r10, %rax
+; AVX512F-ONLY-NEXT:    shlq $24, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r10, %r12
+; AVX512F-ONLY-NEXT:    shlq $25, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r10, %rax
+; AVX512F-ONLY-NEXT:    shlq $26, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r10, %r12
+; AVX512F-ONLY-NEXT:    shlq $27, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $28, %r10
+; AVX512F-ONLY-NEXT:    orq %r12, %r10
+; AVX512F-ONLY-NEXT:    movzbl %r9b, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $29, %r12
+; AVX512F-ONLY-NEXT:    orq %r10, %r12
+; AVX512F-ONLY-NEXT:    movq %rax, %r13
+; AVX512F-ONLY-NEXT:    shlq $30, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r10d
+; AVX512F-ONLY-NEXT:    kshiftrw $12, %k3, %k4
+; AVX512F-ONLY-NEXT:    shlq $31, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    andl $1, %r9d
+; AVX512F-ONLY-NEXT:    movq %r9, %r12
+; AVX512F-ONLY-NEXT:    shlq $32, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r9, %rax
+; AVX512F-ONLY-NEXT:    shlq $33, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r9, %r13
+; AVX512F-ONLY-NEXT:    shlq $34, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $13, %k3, %k4
+; AVX512F-ONLY-NEXT:    shlq $35, %r9
+; AVX512F-ONLY-NEXT:    orq %r13, %r9
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $36, %rax
+; AVX512F-ONLY-NEXT:    orq %r9, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r9
+; AVX512F-ONLY-NEXT:    shlq $37, %r9
+; AVX512F-ONLY-NEXT:    orq %rax, %r9
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $38, %rax
+; AVX512F-ONLY-NEXT:    orq %r9, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r9
+; AVX512F-ONLY-NEXT:    shlq $39, %r9
+; AVX512F-ONLY-NEXT:    orq %rax, %r9
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $40, %rax
+; AVX512F-ONLY-NEXT:    orq %r9, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r9
+; AVX512F-ONLY-NEXT:    shlq $41, %r9
+; AVX512F-ONLY-NEXT:    orq %rax, %r9
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r13d
+; AVX512F-ONLY-NEXT:    kshiftrw $14, %k3, %k4
+; AVX512F-ONLY-NEXT:    shlq $42, %r12
+; AVX512F-ONLY-NEXT:    orq %r9, %r12
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $43, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r9
+; AVX512F-ONLY-NEXT:    shlq $44, %r9
+; AVX512F-ONLY-NEXT:    orq %rax, %r9
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $45, %rax
+; AVX512F-ONLY-NEXT:    orq %r9, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r9
+; AVX512F-ONLY-NEXT:    shlq $46, %r9
+; AVX512F-ONLY-NEXT:    orq %rax, %r9
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $47, %rax
+; AVX512F-ONLY-NEXT:    orq %r9, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $48, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r9d
+; AVX512F-ONLY-NEXT:    kshiftrw $4, %k2, %k4
+; AVX512F-ONLY-NEXT:    shlq $49, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    andl $1, %r9d
+; AVX512F-ONLY-NEXT:    movq %r9, %rax
+; AVX512F-ONLY-NEXT:    shlq $50, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r9, %r12
+; AVX512F-ONLY-NEXT:    shlq $51, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r9, %rax
+; AVX512F-ONLY-NEXT:    shlq $52, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r9, %r12
+; AVX512F-ONLY-NEXT:    shlq $53, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r9, %rax
+; AVX512F-ONLY-NEXT:    shlq $54, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r9, %r12
+; AVX512F-ONLY-NEXT:    shlq $55, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $56, %r9
+; AVX512F-ONLY-NEXT:    orq %r12, %r9
+; AVX512F-ONLY-NEXT:    movzbl %r10b, %eax
+; AVX512F-ONLY-NEXT:    # kill: def $r10d killed $r10d def $r10
+; AVX512F-ONLY-NEXT:    andl $1, %r10d
+; AVX512F-ONLY-NEXT:    movq %r10, %r12
+; AVX512F-ONLY-NEXT:    shlq $57, %r12
+; AVX512F-ONLY-NEXT:    orq %r9, %r12
+; AVX512F-ONLY-NEXT:    movq %r10, %r9
+; AVX512F-ONLY-NEXT:    shlq $58, %r9
+; AVX512F-ONLY-NEXT:    orq %r12, %r9
+; AVX512F-ONLY-NEXT:    movq %r10, %r12
+; AVX512F-ONLY-NEXT:    shlq $59, %r12
+; AVX512F-ONLY-NEXT:    orq %r9, %r12
+; AVX512F-ONLY-NEXT:    movq %r10, %r9
+; AVX512F-ONLY-NEXT:    shlq $60, %r9
+; AVX512F-ONLY-NEXT:    orq %r12, %r9
+; AVX512F-ONLY-NEXT:    movq %r10, %r12
+; AVX512F-ONLY-NEXT:    shlq $61, %r12
+; AVX512F-ONLY-NEXT:    orq %r9, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r9d
+; AVX512F-ONLY-NEXT:    kshiftrw $6, %k2, %k4
+; AVX512F-ONLY-NEXT:    shlq $62, %r10
+; AVX512F-ONLY-NEXT:    orq %r12, %r10
+; AVX512F-ONLY-NEXT:    shlq $63, %rax
+; AVX512F-ONLY-NEXT:    orq %r10, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movq %rax, 48(%rsi)
+; AVX512F-ONLY-NEXT:    movzbl %r9b, %r10d
+; AVX512F-ONLY-NEXT:    movl %r10d, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    leaq (%rax,%rax,2), %r15
+; AVX512F-ONLY-NEXT:    leaq (%r15,%rax,4), %r15
+; AVX512F-ONLY-NEXT:    movzbl %bpl, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    leaq (%r15,%rax,8), %r15
+; AVX512F-ONLY-NEXT:    movq %rax, %r13
+; AVX512F-ONLY-NEXT:    shlq $4, %r13
+; AVX512F-ONLY-NEXT:    orq %r15, %r13
+; AVX512F-ONLY-NEXT:    movq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $5, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    movq %rax, %r15
+; AVX512F-ONLY-NEXT:    shlq $6, %r15
+; AVX512F-ONLY-NEXT:    movq %rax, %r13
+; AVX512F-ONLY-NEXT:    shlq $7, %r13
+; AVX512F-ONLY-NEXT:    orq %r15, %r13
+; AVX512F-ONLY-NEXT:    movq %rax, %r15
+; AVX512F-ONLY-NEXT:    shlq $8, %r15
+; AVX512F-ONLY-NEXT:    orq %r13, %r15
+; AVX512F-ONLY-NEXT:    kmovw %k4, %ebp
+; AVX512F-ONLY-NEXT:    kshiftrw $7, %k2, %k4
+; AVX512F-ONLY-NEXT:    shlq $9, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movzbl %bpl, %r15d
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    movq %r15, %r13
+; AVX512F-ONLY-NEXT:    shlq $10, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $11, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %r13
+; AVX512F-ONLY-NEXT:    shlq $12, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $13, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %r13
+; AVX512F-ONLY-NEXT:    shlq $14, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $15, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k4, %ebp
+; AVX512F-ONLY-NEXT:    kshiftrw $8, %k2, %k4
+; AVX512F-ONLY-NEXT:    shlq $16, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    movzbl %bpl, %r13d
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $17, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r15
+; AVX512F-ONLY-NEXT:    shlq $18, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $19, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r15
+; AVX512F-ONLY-NEXT:    shlq $20, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $21, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r15
+; AVX512F-ONLY-NEXT:    shlq $22, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    kmovw %k4, %eax
+; AVX512F-ONLY-NEXT:    kshiftrw $9, %k2, %k4
+; AVX512F-ONLY-NEXT:    shlq $23, %r13
+; AVX512F-ONLY-NEXT:    orq %r15, %r13
+; AVX512F-ONLY-NEXT:    movzbl %al, %ebp
+; AVX512F-ONLY-NEXT:    andl $1, %ebp
+; AVX512F-ONLY-NEXT:    movq %rbp, %rax
+; AVX512F-ONLY-NEXT:    shlq $24, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %rbp, %r15
+; AVX512F-ONLY-NEXT:    shlq $25, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    movq %rbp, %rax
+; AVX512F-ONLY-NEXT:    shlq $26, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movq %rbp, %r15
+; AVX512F-ONLY-NEXT:    shlq $27, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    movq %rbp, %rax
+; AVX512F-ONLY-NEXT:    shlq $28, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movq %rbp, %r13
+; AVX512F-ONLY-NEXT:    shlq $29, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r15d
+; AVX512F-ONLY-NEXT:    kshiftrw $10, %k2, %k4
+; AVX512F-ONLY-NEXT:    shlq $30, %rbp
+; AVX512F-ONLY-NEXT:    orq %r13, %rbp
+; AVX512F-ONLY-NEXT:    movzbl %r15b, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    shlq $31, %rax
+; AVX512F-ONLY-NEXT:    orq %rbp, %rax
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    movq %r15, %r13
+; AVX512F-ONLY-NEXT:    shlq $32, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $33, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %r13
+; AVX512F-ONLY-NEXT:    shlq $34, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $35, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %rbp
+; AVX512F-ONLY-NEXT:    shlq $36, %rbp
+; AVX512F-ONLY-NEXT:    orq %rax, %rbp
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r13d
+; AVX512F-ONLY-NEXT:    kshiftrw $11, %k2, %k4
+; AVX512F-ONLY-NEXT:    shlq $37, %r15
+; AVX512F-ONLY-NEXT:    orq %rbp, %r15
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $38, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r15
+; AVX512F-ONLY-NEXT:    shlq $39, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $40, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r15
+; AVX512F-ONLY-NEXT:    shlq $41, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $42, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r15
+; AVX512F-ONLY-NEXT:    shlq $43, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    kmovw %k4, %ebp
+; AVX512F-ONLY-NEXT:    kshiftrw $12, %k2, %k4
+; AVX512F-ONLY-NEXT:    shlq $44, %r13
+; AVX512F-ONLY-NEXT:    orq %r15, %r13
+; AVX512F-ONLY-NEXT:    andl $1, %ebp
+; AVX512F-ONLY-NEXT:    movq %rbp, %rax
+; AVX512F-ONLY-NEXT:    shlq $45, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %rbp, %r15
+; AVX512F-ONLY-NEXT:    shlq $46, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    movq %rbp, %rax
+; AVX512F-ONLY-NEXT:    shlq $47, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movq %rbp, %r15
+; AVX512F-ONLY-NEXT:    shlq $48, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    movq %rbp, %rax
+; AVX512F-ONLY-NEXT:    shlq $49, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movq %rbp, %r13
+; AVX512F-ONLY-NEXT:    shlq $50, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r15d
+; AVX512F-ONLY-NEXT:    kshiftrw $6, %k0, %k4
+; AVX512F-ONLY-NEXT:    shlq $51, %rbp
+; AVX512F-ONLY-NEXT:    orq %r13, %rbp
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $52, %rax
+; AVX512F-ONLY-NEXT:    orq %rbp, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %r13
+; AVX512F-ONLY-NEXT:    shlq $53, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $54, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %r13
+; AVX512F-ONLY-NEXT:    shlq $55, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $56, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %r13
+; AVX512F-ONLY-NEXT:    shlq $57, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    shlq $58, %r15
+; AVX512F-ONLY-NEXT:    orq %r13, %r15
+; AVX512F-ONLY-NEXT:    movzbl %bl, %r13d
+; AVX512F-ONLY-NEXT:    movl %ebx, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %rbx
+; AVX512F-ONLY-NEXT:    shlq $59, %rbx
+; AVX512F-ONLY-NEXT:    orq %r15, %rbx
+; AVX512F-ONLY-NEXT:    movq %rax, %r15
+; AVX512F-ONLY-NEXT:    shlq $60, %r15
+; AVX512F-ONLY-NEXT:    orq %rbx, %r15
+; AVX512F-ONLY-NEXT:    movq %rax, %rbx
+; AVX512F-ONLY-NEXT:    shlq $61, %rbx
+; AVX512F-ONLY-NEXT:    orq %r15, %rbx
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r15d
+; AVX512F-ONLY-NEXT:    kshiftrw $14, %k2, %k4
+; AVX512F-ONLY-NEXT:    shlq $62, %rax
+; AVX512F-ONLY-NEXT:    orq %rbx, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %rbx
+; AVX512F-ONLY-NEXT:    shlq $63, %rbx
+; AVX512F-ONLY-NEXT:    orq %rax, %rbx
+; AVX512F-ONLY-NEXT:    kmovw %k4, %eax
+; AVX512F-ONLY-NEXT:    kshiftrw $15, %k2, %k4
+; AVX512F-ONLY-NEXT:    orq %r12, %rbx
+; AVX512F-ONLY-NEXT:    movq %rbx, 32(%rsi)
+; AVX512F-ONLY-NEXT:    movzbl %al, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    leaq (%r13,%r13,2), %rbx
+; AVX512F-ONLY-NEXT:    leaq (%rbx,%rax,4), %rbx
+; AVX512F-ONLY-NEXT:    leaq (%rbx,%rax,8), %rbx
+; AVX512F-ONLY-NEXT:    movq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $4, %r12
+; AVX512F-ONLY-NEXT:    orq %rbx, %r12
+; AVX512F-ONLY-NEXT:    movq %rax, %rbx
+; AVX512F-ONLY-NEXT:    shlq $5, %rbx
+; AVX512F-ONLY-NEXT:    orq %r12, %rbx
+; AVX512F-ONLY-NEXT:    movq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $6, %r12
+; AVX512F-ONLY-NEXT:    movq %rax, %r13
+; AVX512F-ONLY-NEXT:    shlq $7, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k4, %ebp
+; AVX512F-ONLY-NEXT:    kshiftrw $1, %k3, %k4
+; AVX512F-ONLY-NEXT:    shlq $8, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movzbl %bpl, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $9, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $10, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $11, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $12, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $13, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $14, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $15, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k3, %eax
+; AVX512F-ONLY-NEXT:    movzbl %al, %r13d
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $16, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $17, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $18, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $19, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $20, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $21, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k4, %eax
+; AVX512F-ONLY-NEXT:    kshiftrw $2, %k3, %k4
+; AVX512F-ONLY-NEXT:    shlq $22, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    movzbl %al, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $23, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $24, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $25, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $26, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $27, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $28, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k4, %eax
+; AVX512F-ONLY-NEXT:    kshiftrw $3, %k3, %k4
+; AVX512F-ONLY-NEXT:    shlq $29, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    movzbl %al, %r13d
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $30, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $31, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $32, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $33, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $34, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $35, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k4, %ebp
+; AVX512F-ONLY-NEXT:    kshiftrw $4, %k3, %k4
+; AVX512F-ONLY-NEXT:    shlq $36, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    andl $1, %ebp
+; AVX512F-ONLY-NEXT:    movq %rbp, %rax
+; AVX512F-ONLY-NEXT:    shlq $37, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %rbp, %r12
+; AVX512F-ONLY-NEXT:    shlq $38, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %rbp, %rax
+; AVX512F-ONLY-NEXT:    shlq $39, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %rbp, %r12
+; AVX512F-ONLY-NEXT:    shlq $40, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %rbp, %rax
+; AVX512F-ONLY-NEXT:    shlq $41, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %rbp, %r13
+; AVX512F-ONLY-NEXT:    shlq $42, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k4, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $5, %k3, %k3
+; AVX512F-ONLY-NEXT:    shlq $43, %rbp
+; AVX512F-ONLY-NEXT:    orq %r13, %rbp
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $44, %rax
+; AVX512F-ONLY-NEXT:    orq %rbp, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $45, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $46, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $47, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $48, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $49, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k3, %eax
+; AVX512F-ONLY-NEXT:    kshiftrw $3, %k0, %k3
+; AVX512F-ONLY-NEXT:    shlq $50, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    movzbl %al, %r13d
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $51, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $52, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $53, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $54, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $55, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $56, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k3, %ebp
+; AVX512F-ONLY-NEXT:    kshiftrw $2, %k0, %k3
+; AVX512F-ONLY-NEXT:    shlq $57, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    andl $1, %r11d
+; AVX512F-ONLY-NEXT:    movq %r11, %rax
+; AVX512F-ONLY-NEXT:    shlq $58, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r11, %r12
+; AVX512F-ONLY-NEXT:    shlq $59, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r11, %rax
+; AVX512F-ONLY-NEXT:    shlq $60, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r11, %r12
+; AVX512F-ONLY-NEXT:    shlq $61, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k3, %eax
+; AVX512F-ONLY-NEXT:    kshiftrw $4, %k0, %k3
+; AVX512F-ONLY-NEXT:    shlq $62, %r11
+; AVX512F-ONLY-NEXT:    orq %r12, %r11
+; AVX512F-ONLY-NEXT:    shlq $63, %r14
+; AVX512F-ONLY-NEXT:    orq %r11, %r14
+; AVX512F-ONLY-NEXT:    orq %rbx, %r14
+; AVX512F-ONLY-NEXT:    movq %r14, 40(%rsi)
+; AVX512F-ONLY-NEXT:    movzbl %al, %r11d
+; AVX512F-ONLY-NEXT:    movl %r11d, %ebx
+; AVX512F-ONLY-NEXT:    andl $1, %ebx
+; AVX512F-ONLY-NEXT:    leaq (%rbx,%rbx,2), %rax
+; AVX512F-ONLY-NEXT:    leaq (%rax,%rbx,4), %rax
+; AVX512F-ONLY-NEXT:    leaq (%rax,%rbx,8), %rax
+; AVX512F-ONLY-NEXT:    movq %rbx, %r12
+; AVX512F-ONLY-NEXT:    shlq $4, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movzbl %bpl, %r13d
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %r14
+; AVX512F-ONLY-NEXT:    shlq $5, %r14
+; AVX512F-ONLY-NEXT:    orq %r12, %r14
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $6, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $7, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $8, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $9, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $10, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k3, %ebp
+; AVX512F-ONLY-NEXT:    kshiftrw $5, %k0, %k3
+; AVX512F-ONLY-NEXT:    shlq $11, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movzbl %bpl, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $12, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $13, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $14, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $15, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $16, %rax
+; AVX512F-ONLY-NEXT:    orq %r13, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $17, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k3, %eax
+; AVX512F-ONLY-NEXT:    kshiftrw $11, %k0, %k3
+; AVX512F-ONLY-NEXT:    shlq $18, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    movzbl %al, %r13d
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $19, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $20, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $21, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $22, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $23, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %r12
+; AVX512F-ONLY-NEXT:    shlq $24, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $25, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    movzbl %r15b, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $26, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    movq %rax, %r13
+; AVX512F-ONLY-NEXT:    shlq $27, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    movq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $28, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    movq %rax, %r13
+; AVX512F-ONLY-NEXT:    shlq $29, %r13
+; AVX512F-ONLY-NEXT:    orq %r12, %r13
+; AVX512F-ONLY-NEXT:    movq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $30, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k3, %ebp
+; AVX512F-ONLY-NEXT:    kshiftrw $7, %k0, %k3
+; AVX512F-ONLY-NEXT:    shlq $31, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    kmovw %k3, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $8, %k0, %k3
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    shlq $32, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $33, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r15
+; AVX512F-ONLY-NEXT:    shlq $34, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $35, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r15
+; AVX512F-ONLY-NEXT:    shlq $36, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $37, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $38, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k3, %r15d
+; AVX512F-ONLY-NEXT:    kshiftrw $9, %k0, %k3
+; AVX512F-ONLY-NEXT:    shlq $39, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $40, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $41, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $42, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $43, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $44, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %r13
+; AVX512F-ONLY-NEXT:    shlq $45, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k3, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $10, %k0, %k3
+; AVX512F-ONLY-NEXT:    shlq $46, %r15
+; AVX512F-ONLY-NEXT:    orq %r13, %r15
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $47, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r15
+; AVX512F-ONLY-NEXT:    shlq $48, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $49, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r15
+; AVX512F-ONLY-NEXT:    shlq $50, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $51, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $52, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k3, %r15d
+; AVX512F-ONLY-NEXT:    kshiftrw $10, %k1, %k3
+; AVX512F-ONLY-NEXT:    shlq $53, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $54, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $55, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $56, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $57, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $58, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $59, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $60, %r15
+; AVX512F-ONLY-NEXT:    orq %r12, %r15
+; AVX512F-ONLY-NEXT:    movzbl %bpl, %eax
+; AVX512F-ONLY-NEXT:    movl %ebp, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $61, %r13
+; AVX512F-ONLY-NEXT:    orq %r15, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k3, %ebp
+; AVX512F-ONLY-NEXT:    kshiftrw $12, %k0, %k3
+; AVX512F-ONLY-NEXT:    shlq $62, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    movq %rax, %r15
+; AVX512F-ONLY-NEXT:    shlq $63, %r15
+; AVX512F-ONLY-NEXT:    orq %r12, %r15
+; AVX512F-ONLY-NEXT:    kmovw %k3, %r12d
+; AVX512F-ONLY-NEXT:    kshiftrw $13, %k0, %k3
+; AVX512F-ONLY-NEXT:    orq %r14, %r15
+; AVX512F-ONLY-NEXT:    movq %r15, 16(%rsi)
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    leaq (%rax,%rax,2), %r14
+; AVX512F-ONLY-NEXT:    leaq (%r14,%rax,4), %r14
+; AVX512F-ONLY-NEXT:    leaq (%r14,%rax,8), %rax
+; AVX512F-ONLY-NEXT:    movzbl %r12b, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %r15
+; AVX512F-ONLY-NEXT:    shlq $4, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    movq %r12, %r14
+; AVX512F-ONLY-NEXT:    shlq $5, %r14
+; AVX512F-ONLY-NEXT:    orq %r15, %r14
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $6, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r15
+; AVX512F-ONLY-NEXT:    shlq $7, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $8, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r15
+; AVX512F-ONLY-NEXT:    shlq $9, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    kmovw %k3, %eax
+; AVX512F-ONLY-NEXT:    kshiftrw $14, %k0, %k3
+; AVX512F-ONLY-NEXT:    shlq $10, %r12
+; AVX512F-ONLY-NEXT:    orq %r15, %r12
+; AVX512F-ONLY-NEXT:    movzbl %al, %r15d
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $11, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $12, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $13, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $14, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $15, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $16, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k3, %eax
+; AVX512F-ONLY-NEXT:    kshiftrw $15, %k0, %k3
+; AVX512F-ONLY-NEXT:    shlq $17, %r15
+; AVX512F-ONLY-NEXT:    orq %r12, %r15
+; AVX512F-ONLY-NEXT:    movzbl %al, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $18, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r15
+; AVX512F-ONLY-NEXT:    shlq $19, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $20, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r15
+; AVX512F-ONLY-NEXT:    shlq $21, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $22, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r15
+; AVX512F-ONLY-NEXT:    shlq $23, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    kmovw %k3, %eax
+; AVX512F-ONLY-NEXT:    kshiftrw $1, %k2, %k3
+; AVX512F-ONLY-NEXT:    shlq $24, %r12
+; AVX512F-ONLY-NEXT:    orq %r15, %r12
+; AVX512F-ONLY-NEXT:    movzbl %al, %r15d
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $25, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $26, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $27, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $28, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $29, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $30, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    shlq $31, %r15
+; AVX512F-ONLY-NEXT:    orq %r12, %r15
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $32, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r15
+; AVX512F-ONLY-NEXT:    shlq $33, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $34, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r15
+; AVX512F-ONLY-NEXT:    shlq $35, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $36, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $37, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k3, %r15d
+; AVX512F-ONLY-NEXT:    kshiftrw $2, %k2, %k3
+; AVX512F-ONLY-NEXT:    shlq $38, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $39, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $40, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $41, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $42, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $43, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $44, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k3, %eax
+; AVX512F-ONLY-NEXT:    kshiftrw $3, %k2, %k2
+; AVX512F-ONLY-NEXT:    shlq $45, %r15
+; AVX512F-ONLY-NEXT:    orq %r12, %r15
+; AVX512F-ONLY-NEXT:    movzbl %al, %r12d
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $46, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r15
+; AVX512F-ONLY-NEXT:    shlq $47, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $48, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r15
+; AVX512F-ONLY-NEXT:    shlq $49, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $50, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %r13
+; AVX512F-ONLY-NEXT:    shlq $51, %r13
+; AVX512F-ONLY-NEXT:    orq %rax, %r13
+; AVX512F-ONLY-NEXT:    kmovw %k2, %r15d
+; AVX512F-ONLY-NEXT:    kshiftrw $14, %k1, %k1
+; AVX512F-ONLY-NEXT:    shlq $52, %r12
+; AVX512F-ONLY-NEXT:    orq %r13, %r12
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $53, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $54, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $55, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $56, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $57, %rax
+; AVX512F-ONLY-NEXT:    orq %r12, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %r12
+; AVX512F-ONLY-NEXT:    shlq $58, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k1, %r13d
+; AVX512F-ONLY-NEXT:    shlq $59, %r15
+; AVX512F-ONLY-NEXT:    orq %r12, %r15
+; AVX512F-ONLY-NEXT:    andl $1, %r9d
+; AVX512F-ONLY-NEXT:    movq %r9, %rax
+; AVX512F-ONLY-NEXT:    shlq $60, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movq %r9, %r12
+; AVX512F-ONLY-NEXT:    shlq $61, %r12
+; AVX512F-ONLY-NEXT:    orq %rax, %r12
+; AVX512F-ONLY-NEXT:    kmovw %k0, %r15d
+; AVX512F-ONLY-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512F-ONLY-NEXT:    shlq $62, %r9
+; AVX512F-ONLY-NEXT:    orq %r12, %r9
+; AVX512F-ONLY-NEXT:    kmovw %k0, %r12d
+; AVX512F-ONLY-NEXT:    shlq $63, %r10
+; AVX512F-ONLY-NEXT:    orq %r9, %r10
+; AVX512F-ONLY-NEXT:    orq %r14, %r10
+; AVX512F-ONLY-NEXT:    movq %r10, 24(%rsi)
+; AVX512F-ONLY-NEXT:    andl $1, %r8d
+; AVX512F-ONLY-NEXT:    leaq (%r8,%r8,2), %rax
+; AVX512F-ONLY-NEXT:    leaq (%rax,%r8,4), %rax
+; AVX512F-ONLY-NEXT:    leaq (%rax,%r8,8), %rax
+; AVX512F-ONLY-NEXT:    movq %r8, %r9
+; AVX512F-ONLY-NEXT:    shlq $4, %r9
+; AVX512F-ONLY-NEXT:    orq %rax, %r9
+; AVX512F-ONLY-NEXT:    shlq $5, %r8
+; AVX512F-ONLY-NEXT:    orq %r9, %r8
+; AVX512F-ONLY-NEXT:    movzbl %bpl, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %r9
+; AVX512F-ONLY-NEXT:    shlq $6, %r9
+; AVX512F-ONLY-NEXT:    movq %rax, %r10
+; AVX512F-ONLY-NEXT:    shlq $7, %r10
+; AVX512F-ONLY-NEXT:    orq %r9, %r10
+; AVX512F-ONLY-NEXT:    movq %rax, %r9
+; AVX512F-ONLY-NEXT:    shlq $8, %r9
+; AVX512F-ONLY-NEXT:    orq %r10, %r9
+; AVX512F-ONLY-NEXT:    movq %rax, %r10
+; AVX512F-ONLY-NEXT:    shlq $9, %r10
+; AVX512F-ONLY-NEXT:    orq %r9, %r10
+; AVX512F-ONLY-NEXT:    movq %rax, %r9
+; AVX512F-ONLY-NEXT:    shlq $10, %r9
+; AVX512F-ONLY-NEXT:    orq %r10, %r9
+; AVX512F-ONLY-NEXT:    movq %rax, %r10
+; AVX512F-ONLY-NEXT:    shlq $11, %r10
+; AVX512F-ONLY-NEXT:    orq %r9, %r10
+; AVX512F-ONLY-NEXT:    shlq $12, %rax
+; AVX512F-ONLY-NEXT:    orq %r10, %rax
+; AVX512F-ONLY-NEXT:    movzbl %dil, %edi
+; AVX512F-ONLY-NEXT:    andl $1, %edi
+; AVX512F-ONLY-NEXT:    movq %rdi, %r9
+; AVX512F-ONLY-NEXT:    shlq $13, %r9
+; AVX512F-ONLY-NEXT:    orq %rax, %r9
+; AVX512F-ONLY-NEXT:    movq %rdi, %rax
+; AVX512F-ONLY-NEXT:    shlq $14, %rax
+; AVX512F-ONLY-NEXT:    orq %r9, %rax
+; AVX512F-ONLY-NEXT:    movq %rdi, %r9
+; AVX512F-ONLY-NEXT:    shlq $15, %r9
+; AVX512F-ONLY-NEXT:    orq %rax, %r9
+; AVX512F-ONLY-NEXT:    movq %rdi, %rax
+; AVX512F-ONLY-NEXT:    shlq $16, %rax
+; AVX512F-ONLY-NEXT:    orq %r9, %rax
+; AVX512F-ONLY-NEXT:    movq %rdi, %r9
+; AVX512F-ONLY-NEXT:    shlq $17, %r9
+; AVX512F-ONLY-NEXT:    orq %rax, %r9
+; AVX512F-ONLY-NEXT:    movq %rdi, %rax
+; AVX512F-ONLY-NEXT:    shlq $18, %rax
+; AVX512F-ONLY-NEXT:    orq %r9, %rax
+; AVX512F-ONLY-NEXT:    shlq $19, %rdi
+; AVX512F-ONLY-NEXT:    orq %rax, %rdi
+; AVX512F-ONLY-NEXT:    movzbl %dl, %eax
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %rdx
+; AVX512F-ONLY-NEXT:    shlq $20, %rdx
+; AVX512F-ONLY-NEXT:    orq %rdi, %rdx
+; AVX512F-ONLY-NEXT:    movq %rax, %rdi
+; AVX512F-ONLY-NEXT:    shlq $21, %rdi
+; AVX512F-ONLY-NEXT:    orq %rdx, %rdi
+; AVX512F-ONLY-NEXT:    movq %rax, %rdx
+; AVX512F-ONLY-NEXT:    shlq $22, %rdx
+; AVX512F-ONLY-NEXT:    orq %rdi, %rdx
+; AVX512F-ONLY-NEXT:    movq %rax, %rdi
+; AVX512F-ONLY-NEXT:    shlq $23, %rdi
+; AVX512F-ONLY-NEXT:    orq %rdx, %rdi
+; AVX512F-ONLY-NEXT:    movq %rax, %rdx
+; AVX512F-ONLY-NEXT:    shlq $24, %rdx
+; AVX512F-ONLY-NEXT:    orq %rdi, %rdx
+; AVX512F-ONLY-NEXT:    movq %rax, %rdi
+; AVX512F-ONLY-NEXT:    shlq $25, %rdi
+; AVX512F-ONLY-NEXT:    orq %rdx, %rdi
+; AVX512F-ONLY-NEXT:    shlq $26, %rax
+; AVX512F-ONLY-NEXT:    orq %rdi, %rax
+; AVX512F-ONLY-NEXT:    movzbl %cl, %ecx
+; AVX512F-ONLY-NEXT:    andl $1, %ecx
+; AVX512F-ONLY-NEXT:    movq %rcx, %rdx
+; AVX512F-ONLY-NEXT:    shlq $27, %rdx
+; AVX512F-ONLY-NEXT:    orq %rax, %rdx
+; AVX512F-ONLY-NEXT:    movq %rcx, %rax
+; AVX512F-ONLY-NEXT:    shlq $28, %rax
+; AVX512F-ONLY-NEXT:    orq %rdx, %rax
+; AVX512F-ONLY-NEXT:    movq %rcx, %rdx
+; AVX512F-ONLY-NEXT:    shlq $29, %rdx
+; AVX512F-ONLY-NEXT:    orq %rax, %rdx
+; AVX512F-ONLY-NEXT:    movq %rcx, %rax
+; AVX512F-ONLY-NEXT:    shlq $30, %rax
+; AVX512F-ONLY-NEXT:    orq %rdx, %rax
+; AVX512F-ONLY-NEXT:    movq %rcx, %rdx
+; AVX512F-ONLY-NEXT:    shlq $31, %rdx
+; AVX512F-ONLY-NEXT:    orq %rax, %rdx
+; AVX512F-ONLY-NEXT:    movq %rcx, %rax
+; AVX512F-ONLY-NEXT:    shlq $32, %rax
+; AVX512F-ONLY-NEXT:    orq %rdx, %rax
+; AVX512F-ONLY-NEXT:    shlq $33, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    andl $1, %r13d
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $34, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %rcx
+; AVX512F-ONLY-NEXT:    shlq $35, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $36, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %rcx
+; AVX512F-ONLY-NEXT:    shlq $37, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    movq %r13, %rax
+; AVX512F-ONLY-NEXT:    shlq $38, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    movq %r13, %rcx
+; AVX512F-ONLY-NEXT:    shlq $39, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    shlq $40, %r13
+; AVX512F-ONLY-NEXT:    orq %rcx, %r13
+; AVX512F-ONLY-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; AVX512F-ONLY-NEXT:    andl $1, %eax
+; AVX512F-ONLY-NEXT:    movq %rax, %rcx
+; AVX512F-ONLY-NEXT:    shlq $41, %rcx
+; AVX512F-ONLY-NEXT:    orq %r13, %rcx
+; AVX512F-ONLY-NEXT:    movq %rax, %rdx
+; AVX512F-ONLY-NEXT:    shlq $42, %rdx
+; AVX512F-ONLY-NEXT:    orq %rcx, %rdx
+; AVX512F-ONLY-NEXT:    movq %rax, %rcx
+; AVX512F-ONLY-NEXT:    shlq $43, %rcx
+; AVX512F-ONLY-NEXT:    orq %rdx, %rcx
+; AVX512F-ONLY-NEXT:    movq %rax, %rdx
+; AVX512F-ONLY-NEXT:    shlq $44, %rdx
+; AVX512F-ONLY-NEXT:    orq %rcx, %rdx
+; AVX512F-ONLY-NEXT:    movq %rax, %rcx
+; AVX512F-ONLY-NEXT:    shlq $45, %rcx
+; AVX512F-ONLY-NEXT:    orq %rdx, %rcx
+; AVX512F-ONLY-NEXT:    movq %rax, %rdx
+; AVX512F-ONLY-NEXT:    shlq $46, %rdx
+; AVX512F-ONLY-NEXT:    orq %rcx, %rdx
+; AVX512F-ONLY-NEXT:    shlq $47, %rax
+; AVX512F-ONLY-NEXT:    orq %rdx, %rax
+; AVX512F-ONLY-NEXT:    andl $1, %r15d
+; AVX512F-ONLY-NEXT:    movq %r15, %rcx
+; AVX512F-ONLY-NEXT:    shlq $48, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $49, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %rcx
+; AVX512F-ONLY-NEXT:    shlq $50, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $51, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    movq %r15, %rcx
+; AVX512F-ONLY-NEXT:    shlq $52, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    movq %r15, %rax
+; AVX512F-ONLY-NEXT:    shlq $53, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    shlq $54, %r15
+; AVX512F-ONLY-NEXT:    orq %rax, %r15
+; AVX512F-ONLY-NEXT:    andl $1, %r12d
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $55, %rax
+; AVX512F-ONLY-NEXT:    orq %r15, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %rcx
+; AVX512F-ONLY-NEXT:    shlq $56, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $57, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %rcx
+; AVX512F-ONLY-NEXT:    shlq $58, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    movq %r12, %rax
+; AVX512F-ONLY-NEXT:    shlq $59, %rax
+; AVX512F-ONLY-NEXT:    orq %rcx, %rax
+; AVX512F-ONLY-NEXT:    movq %r12, %rcx
+; AVX512F-ONLY-NEXT:    shlq $60, %rcx
+; AVX512F-ONLY-NEXT:    orq %rax, %rcx
+; AVX512F-ONLY-NEXT:    shlq $61, %r12
+; AVX512F-ONLY-NEXT:    orq %rcx, %r12
+; AVX512F-ONLY-NEXT:    shlq $62, %rbx
+; AVX512F-ONLY-NEXT:    orq %r12, %rbx
+; AVX512F-ONLY-NEXT:    shlq $63, %r11
+; AVX512F-ONLY-NEXT:    orq %rbx, %r11
+; AVX512F-ONLY-NEXT:    orq %r8, %r11
+; AVX512F-ONLY-NEXT:    movq %r11, 8(%rsi)
+; AVX512F-ONLY-NEXT:    popq %rbx
+; AVX512F-ONLY-NEXT:    popq %r12
+; AVX512F-ONLY-NEXT:    popq %r13
+; AVX512F-ONLY-NEXT:    popq %r14
+; AVX512F-ONLY-NEXT:    popq %r15
+; AVX512F-ONLY-NEXT:    popq %rbp
+; AVX512F-ONLY-NEXT:    retq
+;
+; AVX512DQ-LABEL: mask_replication_factor7_vf64:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    pushq %rbp
+; AVX512DQ-NEXT:    pushq %r15
+; AVX512DQ-NEXT:    pushq %r14
+; AVX512DQ-NEXT:    pushq %r13
+; AVX512DQ-NEXT:    pushq %r12
+; AVX512DQ-NEXT:    pushq %rbx
+; AVX512DQ-NEXT:    kmovw (%rdi), %k1
+; AVX512DQ-NEXT:    kmovw 2(%rdi), %k0
+; AVX512DQ-NEXT:    kmovw 4(%rdi), %k2
+; AVX512DQ-NEXT:    kmovw 6(%rdi), %k3
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k4
+; AVX512DQ-NEXT:    kmovw %k4, %eax
+; AVX512DQ-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512DQ-NEXT:    kshiftrw $13, %k1, %k4
+; AVX512DQ-NEXT:    kmovw %k4, %ecx
+; AVX512DQ-NEXT:    kshiftrw $12, %k1, %k4
+; AVX512DQ-NEXT:    kmovw %k4, %edx
+; AVX512DQ-NEXT:    kshiftrw $11, %k1, %k4
+; AVX512DQ-NEXT:    kmovw %k4, %edi
+; AVX512DQ-NEXT:    kshiftrw $13, %k2, %k4
+; AVX512DQ-NEXT:    kmovw %k4, %ebx
+; AVX512DQ-NEXT:    kshiftrw $5, %k2, %k4
+; AVX512DQ-NEXT:    kmovw %k4, %ebp
+; AVX512DQ-NEXT:    kshiftrw $11, %k3, %k4
+; AVX512DQ-NEXT:    kmovw %k4, %r9d
+; AVX512DQ-NEXT:    kshiftrw $8, %k3, %k4
+; AVX512DQ-NEXT:    kmovw %k4, %r10d
+; AVX512DQ-NEXT:    kshiftrw $6, %k3, %k4
+; AVX512DQ-NEXT:    kmovw %k4, %r11d
+; AVX512DQ-NEXT:    kshiftrw $4, %k1, %k4
+; AVX512DQ-NEXT:    kmovw %k4, %r8d
+; AVX512DQ-NEXT:    kshiftrw $1, %k1, %k4
+; AVX512DQ-NEXT:    kmovw %k4, %eax
+; AVX512DQ-NEXT:    kshiftrw $2, %k1, %k4
+; AVX512DQ-NEXT:    kmovw %k1, %r14d
+; AVX512DQ-NEXT:    movzbl %r14b, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    leaq (%r12,%r12,2), %r14
+; AVX512DQ-NEXT:    leaq (%r14,%r12,4), %r14
+; AVX512DQ-NEXT:    leaq (%r14,%r12,8), %r14
+; AVX512DQ-NEXT:    movq %r12, %r15
+; AVX512DQ-NEXT:    shlq $4, %r15
+; AVX512DQ-NEXT:    orq %r14, %r15
+; AVX512DQ-NEXT:    movq %r12, %r14
+; AVX512DQ-NEXT:    shlq $5, %r14
+; AVX512DQ-NEXT:    orq %r15, %r14
+; AVX512DQ-NEXT:    shlq $6, %r12
+; AVX512DQ-NEXT:    movzbl %al, %r15d
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $7, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $8, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $9, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $10, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $11, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $12, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    kmovw %k4, %eax
+; AVX512DQ-NEXT:    kshiftrw $3, %k1, %k4
+; AVX512DQ-NEXT:    shlq $13, %r15
+; AVX512DQ-NEXT:    orq %r12, %r15
+; AVX512DQ-NEXT:    movzbl %al, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $14, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movq %r12, %r15
+; AVX512DQ-NEXT:    shlq $15, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $16, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movq %r12, %r15
+; AVX512DQ-NEXT:    shlq $17, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $18, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movq %r12, %r15
+; AVX512DQ-NEXT:    shlq $19, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    kmovw %k4, %eax
+; AVX512DQ-NEXT:    kshiftrw $7, %k3, %k4
+; AVX512DQ-NEXT:    shlq $20, %r12
+; AVX512DQ-NEXT:    orq %r15, %r12
+; AVX512DQ-NEXT:    movzbl %al, %r15d
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $21, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $22, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $23, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $24, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $25, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $26, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    shlq $27, %r15
+; AVX512DQ-NEXT:    orq %r12, %r15
+; AVX512DQ-NEXT:    movzbl %r8b, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %r12
+; AVX512DQ-NEXT:    shlq $28, %r12
+; AVX512DQ-NEXT:    orq %r15, %r12
+; AVX512DQ-NEXT:    movq %rax, %r15
+; AVX512DQ-NEXT:    shlq $29, %r15
+; AVX512DQ-NEXT:    orq %r12, %r15
+; AVX512DQ-NEXT:    movq %rax, %r12
+; AVX512DQ-NEXT:    shlq $30, %r12
+; AVX512DQ-NEXT:    orq %r15, %r12
+; AVX512DQ-NEXT:    kmovw %k4, %r15d
+; AVX512DQ-NEXT:    kshiftrw $5, %k1, %k4
+; AVX512DQ-NEXT:    shlq $31, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    andl $1, %r8d
+; AVX512DQ-NEXT:    movq %r8, %r12
+; AVX512DQ-NEXT:    shlq $32, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r8, %rax
+; AVX512DQ-NEXT:    shlq $33, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    kmovw %k4, %r12d
+; AVX512DQ-NEXT:    kshiftrw $6, %k1, %k4
+; AVX512DQ-NEXT:    shlq $34, %r8
+; AVX512DQ-NEXT:    orq %rax, %r8
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $35, %rax
+; AVX512DQ-NEXT:    orq %r8, %rax
+; AVX512DQ-NEXT:    movq %r12, %r8
+; AVX512DQ-NEXT:    shlq $36, %r8
+; AVX512DQ-NEXT:    orq %rax, %r8
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $37, %rax
+; AVX512DQ-NEXT:    orq %r8, %rax
+; AVX512DQ-NEXT:    movq %r12, %r8
+; AVX512DQ-NEXT:    shlq $38, %r8
+; AVX512DQ-NEXT:    orq %rax, %r8
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $39, %rax
+; AVX512DQ-NEXT:    orq %r8, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $40, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    kmovw %k4, %r8d
+; AVX512DQ-NEXT:    kshiftrw $7, %k1, %k4
+; AVX512DQ-NEXT:    shlq $41, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    andl $1, %r8d
+; AVX512DQ-NEXT:    movq %r8, %rax
+; AVX512DQ-NEXT:    shlq $42, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r8, %r12
+; AVX512DQ-NEXT:    shlq $43, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r8, %rax
+; AVX512DQ-NEXT:    shlq $44, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r8, %r12
+; AVX512DQ-NEXT:    shlq $45, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r8, %rax
+; AVX512DQ-NEXT:    shlq $46, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r8, %r13
+; AVX512DQ-NEXT:    shlq $47, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    kmovw %k4, %r12d
+; AVX512DQ-NEXT:    kshiftrw $8, %k1, %k4
+; AVX512DQ-NEXT:    shlq $48, %r8
+; AVX512DQ-NEXT:    orq %r13, %r8
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $49, %rax
+; AVX512DQ-NEXT:    orq %r8, %rax
+; AVX512DQ-NEXT:    movq %r12, %r8
+; AVX512DQ-NEXT:    shlq $50, %r8
+; AVX512DQ-NEXT:    orq %rax, %r8
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $51, %rax
+; AVX512DQ-NEXT:    orq %r8, %rax
+; AVX512DQ-NEXT:    movq %r12, %r8
+; AVX512DQ-NEXT:    shlq $52, %r8
+; AVX512DQ-NEXT:    orq %rax, %r8
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $53, %rax
+; AVX512DQ-NEXT:    orq %r8, %rax
+; AVX512DQ-NEXT:    movq %r12, %r8
+; AVX512DQ-NEXT:    shlq $54, %r8
+; AVX512DQ-NEXT:    orq %rax, %r8
+; AVX512DQ-NEXT:    kmovw %k4, %eax
+; AVX512DQ-NEXT:    kshiftrw $9, %k1, %k4
+; AVX512DQ-NEXT:    shlq $55, %r12
+; AVX512DQ-NEXT:    orq %r8, %r12
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %r8
+; AVX512DQ-NEXT:    shlq $56, %r8
+; AVX512DQ-NEXT:    orq %r12, %r8
+; AVX512DQ-NEXT:    movq %rax, %r12
+; AVX512DQ-NEXT:    shlq $57, %r12
+; AVX512DQ-NEXT:    orq %r8, %r12
+; AVX512DQ-NEXT:    movq %rax, %r8
+; AVX512DQ-NEXT:    shlq $58, %r8
+; AVX512DQ-NEXT:    orq %r12, %r8
+; AVX512DQ-NEXT:    movq %rax, %r12
+; AVX512DQ-NEXT:    shlq $59, %r12
+; AVX512DQ-NEXT:    orq %r8, %r12
+; AVX512DQ-NEXT:    movq %rax, %r8
+; AVX512DQ-NEXT:    shlq $60, %r8
+; AVX512DQ-NEXT:    orq %r12, %r8
+; AVX512DQ-NEXT:    movq %rax, %r12
+; AVX512DQ-NEXT:    shlq $61, %r12
+; AVX512DQ-NEXT:    orq %r8, %r12
+; AVX512DQ-NEXT:    kmovw %k4, %r8d
+; AVX512DQ-NEXT:    kshiftrw $9, %k3, %k4
+; AVX512DQ-NEXT:    shlq $62, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movzbl %r8b, %r8d
+; AVX512DQ-NEXT:    movq %r8, %r12
+; AVX512DQ-NEXT:    shlq $63, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    orq %r14, %r12
+; AVX512DQ-NEXT:    movq %r12, (%rsi)
+; AVX512DQ-NEXT:    movzbl %r15b, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movzbl %r11b, %r14d
+; AVX512DQ-NEXT:    movl %r14d, %r15d
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    leaq (%r15,%rax,2), %r15
+; AVX512DQ-NEXT:    leaq (%r15,%rax,4), %r15
+; AVX512DQ-NEXT:    leaq (%r15,%rax,8), %r15
+; AVX512DQ-NEXT:    movq %rax, %r12
+; AVX512DQ-NEXT:    shlq $4, %r12
+; AVX512DQ-NEXT:    orq %r15, %r12
+; AVX512DQ-NEXT:    movq %rax, %r15
+; AVX512DQ-NEXT:    shlq $5, %r15
+; AVX512DQ-NEXT:    orq %r12, %r15
+; AVX512DQ-NEXT:    movq %rax, %r12
+; AVX512DQ-NEXT:    shlq $6, %r12
+; AVX512DQ-NEXT:    shlq $7, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movzbl %r10b, %r10d
+; AVX512DQ-NEXT:    andl $1, %r10d
+; AVX512DQ-NEXT:    movq %r10, %r12
+; AVX512DQ-NEXT:    shlq $8, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r10, %rax
+; AVX512DQ-NEXT:    shlq $9, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r10, %r12
+; AVX512DQ-NEXT:    shlq $10, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r10, %rax
+; AVX512DQ-NEXT:    shlq $11, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r10, %r12
+; AVX512DQ-NEXT:    shlq $12, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r10, %rax
+; AVX512DQ-NEXT:    shlq $13, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    kmovw %k4, %r12d
+; AVX512DQ-NEXT:    kshiftrw $10, %k3, %k4
+; AVX512DQ-NEXT:    shlq $14, %r10
+; AVX512DQ-NEXT:    orq %rax, %r10
+; AVX512DQ-NEXT:    movzbl %r12b, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $15, %rax
+; AVX512DQ-NEXT:    orq %r10, %rax
+; AVX512DQ-NEXT:    movq %r12, %r10
+; AVX512DQ-NEXT:    shlq $16, %r10
+; AVX512DQ-NEXT:    orq %rax, %r10
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $17, %rax
+; AVX512DQ-NEXT:    orq %r10, %rax
+; AVX512DQ-NEXT:    movq %r12, %r10
+; AVX512DQ-NEXT:    shlq $18, %r10
+; AVX512DQ-NEXT:    orq %rax, %r10
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $19, %rax
+; AVX512DQ-NEXT:    orq %r10, %rax
+; AVX512DQ-NEXT:    movq %r12, %r10
+; AVX512DQ-NEXT:    shlq $20, %r10
+; AVX512DQ-NEXT:    orq %rax, %r10
+; AVX512DQ-NEXT:    kmovw %k4, %eax
+; AVX512DQ-NEXT:    kshiftrw $15, %k3, %k4
+; AVX512DQ-NEXT:    shlq $21, %r12
+; AVX512DQ-NEXT:    orq %r10, %r12
+; AVX512DQ-NEXT:    movzbl %al, %r10d
+; AVX512DQ-NEXT:    andl $1, %r10d
+; AVX512DQ-NEXT:    movq %r10, %rax
+; AVX512DQ-NEXT:    shlq $22, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r10, %r12
+; AVX512DQ-NEXT:    shlq $23, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r10, %rax
+; AVX512DQ-NEXT:    shlq $24, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r10, %r12
+; AVX512DQ-NEXT:    shlq $25, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r10, %rax
+; AVX512DQ-NEXT:    shlq $26, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r10, %r12
+; AVX512DQ-NEXT:    shlq $27, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    shlq $28, %r10
+; AVX512DQ-NEXT:    orq %r12, %r10
+; AVX512DQ-NEXT:    movzbl %r9b, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %r12
+; AVX512DQ-NEXT:    shlq $29, %r12
+; AVX512DQ-NEXT:    orq %r10, %r12
+; AVX512DQ-NEXT:    movq %rax, %r13
+; AVX512DQ-NEXT:    shlq $30, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    kmovw %k4, %r10d
+; AVX512DQ-NEXT:    kshiftrw $12, %k3, %k4
+; AVX512DQ-NEXT:    shlq $31, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    andl $1, %r9d
+; AVX512DQ-NEXT:    movq %r9, %r12
+; AVX512DQ-NEXT:    shlq $32, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r9, %rax
+; AVX512DQ-NEXT:    shlq $33, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r9, %r13
+; AVX512DQ-NEXT:    shlq $34, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    kmovw %k4, %r12d
+; AVX512DQ-NEXT:    kshiftrw $13, %k3, %k4
+; AVX512DQ-NEXT:    shlq $35, %r9
+; AVX512DQ-NEXT:    orq %r13, %r9
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $36, %rax
+; AVX512DQ-NEXT:    orq %r9, %rax
+; AVX512DQ-NEXT:    movq %r12, %r9
+; AVX512DQ-NEXT:    shlq $37, %r9
+; AVX512DQ-NEXT:    orq %rax, %r9
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $38, %rax
+; AVX512DQ-NEXT:    orq %r9, %rax
+; AVX512DQ-NEXT:    movq %r12, %r9
+; AVX512DQ-NEXT:    shlq $39, %r9
+; AVX512DQ-NEXT:    orq %rax, %r9
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $40, %rax
+; AVX512DQ-NEXT:    orq %r9, %rax
+; AVX512DQ-NEXT:    movq %r12, %r9
+; AVX512DQ-NEXT:    shlq $41, %r9
+; AVX512DQ-NEXT:    orq %rax, %r9
+; AVX512DQ-NEXT:    kmovw %k4, %r13d
+; AVX512DQ-NEXT:    kshiftrw $14, %k3, %k4
+; AVX512DQ-NEXT:    shlq $42, %r12
+; AVX512DQ-NEXT:    orq %r9, %r12
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $43, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %r9
+; AVX512DQ-NEXT:    shlq $44, %r9
+; AVX512DQ-NEXT:    orq %rax, %r9
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $45, %rax
+; AVX512DQ-NEXT:    orq %r9, %rax
+; AVX512DQ-NEXT:    movq %r13, %r9
+; AVX512DQ-NEXT:    shlq $46, %r9
+; AVX512DQ-NEXT:    orq %rax, %r9
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $47, %rax
+; AVX512DQ-NEXT:    orq %r9, %rax
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $48, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    kmovw %k4, %r9d
+; AVX512DQ-NEXT:    kshiftrw $4, %k2, %k4
+; AVX512DQ-NEXT:    shlq $49, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    andl $1, %r9d
+; AVX512DQ-NEXT:    movq %r9, %rax
+; AVX512DQ-NEXT:    shlq $50, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r9, %r12
+; AVX512DQ-NEXT:    shlq $51, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r9, %rax
+; AVX512DQ-NEXT:    shlq $52, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r9, %r12
+; AVX512DQ-NEXT:    shlq $53, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r9, %rax
+; AVX512DQ-NEXT:    shlq $54, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r9, %r12
+; AVX512DQ-NEXT:    shlq $55, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    shlq $56, %r9
+; AVX512DQ-NEXT:    orq %r12, %r9
+; AVX512DQ-NEXT:    movzbl %r10b, %eax
+; AVX512DQ-NEXT:    # kill: def $r10d killed $r10d def $r10
+; AVX512DQ-NEXT:    andl $1, %r10d
+; AVX512DQ-NEXT:    movq %r10, %r12
+; AVX512DQ-NEXT:    shlq $57, %r12
+; AVX512DQ-NEXT:    orq %r9, %r12
+; AVX512DQ-NEXT:    movq %r10, %r9
+; AVX512DQ-NEXT:    shlq $58, %r9
+; AVX512DQ-NEXT:    orq %r12, %r9
+; AVX512DQ-NEXT:    movq %r10, %r12
+; AVX512DQ-NEXT:    shlq $59, %r12
+; AVX512DQ-NEXT:    orq %r9, %r12
+; AVX512DQ-NEXT:    movq %r10, %r9
+; AVX512DQ-NEXT:    shlq $60, %r9
+; AVX512DQ-NEXT:    orq %r12, %r9
+; AVX512DQ-NEXT:    movq %r10, %r12
+; AVX512DQ-NEXT:    shlq $61, %r12
+; AVX512DQ-NEXT:    orq %r9, %r12
+; AVX512DQ-NEXT:    kmovw %k4, %r9d
+; AVX512DQ-NEXT:    kshiftrw $6, %k2, %k4
+; AVX512DQ-NEXT:    shlq $62, %r10
+; AVX512DQ-NEXT:    orq %r12, %r10
+; AVX512DQ-NEXT:    shlq $63, %rax
+; AVX512DQ-NEXT:    orq %r10, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movq %rax, 48(%rsi)
+; AVX512DQ-NEXT:    movzbl %r9b, %r10d
+; AVX512DQ-NEXT:    movl %r10d, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    leaq (%rax,%rax,2), %r15
+; AVX512DQ-NEXT:    leaq (%r15,%rax,4), %r15
+; AVX512DQ-NEXT:    movzbl %bpl, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    leaq (%r15,%rax,8), %r15
+; AVX512DQ-NEXT:    movq %rax, %r13
+; AVX512DQ-NEXT:    shlq $4, %r13
+; AVX512DQ-NEXT:    orq %r15, %r13
+; AVX512DQ-NEXT:    movq %rax, %r12
+; AVX512DQ-NEXT:    shlq $5, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    movq %rax, %r15
+; AVX512DQ-NEXT:    shlq $6, %r15
+; AVX512DQ-NEXT:    movq %rax, %r13
+; AVX512DQ-NEXT:    shlq $7, %r13
+; AVX512DQ-NEXT:    orq %r15, %r13
+; AVX512DQ-NEXT:    movq %rax, %r15
+; AVX512DQ-NEXT:    shlq $8, %r15
+; AVX512DQ-NEXT:    orq %r13, %r15
+; AVX512DQ-NEXT:    kmovw %k4, %ebp
+; AVX512DQ-NEXT:    kshiftrw $7, %k2, %k4
+; AVX512DQ-NEXT:    shlq $9, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movzbl %bpl, %r15d
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    movq %r15, %r13
+; AVX512DQ-NEXT:    shlq $10, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $11, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r15, %r13
+; AVX512DQ-NEXT:    shlq $12, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $13, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r15, %r13
+; AVX512DQ-NEXT:    shlq $14, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $15, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    kmovw %k4, %ebp
+; AVX512DQ-NEXT:    kshiftrw $8, %k2, %k4
+; AVX512DQ-NEXT:    shlq $16, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    movzbl %bpl, %r13d
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $17, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movq %r13, %r15
+; AVX512DQ-NEXT:    shlq $18, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $19, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movq %r13, %r15
+; AVX512DQ-NEXT:    shlq $20, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $21, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movq %r13, %r15
+; AVX512DQ-NEXT:    shlq $22, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    kmovw %k4, %eax
+; AVX512DQ-NEXT:    kshiftrw $9, %k2, %k4
+; AVX512DQ-NEXT:    shlq $23, %r13
+; AVX512DQ-NEXT:    orq %r15, %r13
+; AVX512DQ-NEXT:    movzbl %al, %ebp
+; AVX512DQ-NEXT:    andl $1, %ebp
+; AVX512DQ-NEXT:    movq %rbp, %rax
+; AVX512DQ-NEXT:    shlq $24, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %rbp, %r15
+; AVX512DQ-NEXT:    shlq $25, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    movq %rbp, %rax
+; AVX512DQ-NEXT:    shlq $26, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movq %rbp, %r15
+; AVX512DQ-NEXT:    shlq $27, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    movq %rbp, %rax
+; AVX512DQ-NEXT:    shlq $28, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movq %rbp, %r13
+; AVX512DQ-NEXT:    shlq $29, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    kmovw %k4, %r15d
+; AVX512DQ-NEXT:    kshiftrw $10, %k2, %k4
+; AVX512DQ-NEXT:    shlq $30, %rbp
+; AVX512DQ-NEXT:    orq %r13, %rbp
+; AVX512DQ-NEXT:    movzbl %r15b, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    shlq $31, %rax
+; AVX512DQ-NEXT:    orq %rbp, %rax
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    movq %r15, %r13
+; AVX512DQ-NEXT:    shlq $32, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $33, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r15, %r13
+; AVX512DQ-NEXT:    shlq $34, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $35, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r15, %rbp
+; AVX512DQ-NEXT:    shlq $36, %rbp
+; AVX512DQ-NEXT:    orq %rax, %rbp
+; AVX512DQ-NEXT:    kmovw %k4, %r13d
+; AVX512DQ-NEXT:    kshiftrw $11, %k2, %k4
+; AVX512DQ-NEXT:    shlq $37, %r15
+; AVX512DQ-NEXT:    orq %rbp, %r15
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $38, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movq %r13, %r15
+; AVX512DQ-NEXT:    shlq $39, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $40, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movq %r13, %r15
+; AVX512DQ-NEXT:    shlq $41, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $42, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movq %r13, %r15
+; AVX512DQ-NEXT:    shlq $43, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    kmovw %k4, %ebp
+; AVX512DQ-NEXT:    kshiftrw $12, %k2, %k4
+; AVX512DQ-NEXT:    shlq $44, %r13
+; AVX512DQ-NEXT:    orq %r15, %r13
+; AVX512DQ-NEXT:    andl $1, %ebp
+; AVX512DQ-NEXT:    movq %rbp, %rax
+; AVX512DQ-NEXT:    shlq $45, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %rbp, %r15
+; AVX512DQ-NEXT:    shlq $46, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    movq %rbp, %rax
+; AVX512DQ-NEXT:    shlq $47, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movq %rbp, %r15
+; AVX512DQ-NEXT:    shlq $48, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    movq %rbp, %rax
+; AVX512DQ-NEXT:    shlq $49, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movq %rbp, %r13
+; AVX512DQ-NEXT:    shlq $50, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    kmovw %k4, %r15d
+; AVX512DQ-NEXT:    kshiftrw $6, %k0, %k4
+; AVX512DQ-NEXT:    shlq $51, %rbp
+; AVX512DQ-NEXT:    orq %r13, %rbp
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $52, %rax
+; AVX512DQ-NEXT:    orq %rbp, %rax
+; AVX512DQ-NEXT:    movq %r15, %r13
+; AVX512DQ-NEXT:    shlq $53, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $54, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r15, %r13
+; AVX512DQ-NEXT:    shlq $55, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $56, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r15, %r13
+; AVX512DQ-NEXT:    shlq $57, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    shlq $58, %r15
+; AVX512DQ-NEXT:    orq %r13, %r15
+; AVX512DQ-NEXT:    movzbl %bl, %r13d
+; AVX512DQ-NEXT:    movl %ebx, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %rbx
+; AVX512DQ-NEXT:    shlq $59, %rbx
+; AVX512DQ-NEXT:    orq %r15, %rbx
+; AVX512DQ-NEXT:    movq %rax, %r15
+; AVX512DQ-NEXT:    shlq $60, %r15
+; AVX512DQ-NEXT:    orq %rbx, %r15
+; AVX512DQ-NEXT:    movq %rax, %rbx
+; AVX512DQ-NEXT:    shlq $61, %rbx
+; AVX512DQ-NEXT:    orq %r15, %rbx
+; AVX512DQ-NEXT:    kmovw %k4, %r15d
+; AVX512DQ-NEXT:    kshiftrw $14, %k2, %k4
+; AVX512DQ-NEXT:    shlq $62, %rax
+; AVX512DQ-NEXT:    orq %rbx, %rax
+; AVX512DQ-NEXT:    movq %r13, %rbx
+; AVX512DQ-NEXT:    shlq $63, %rbx
+; AVX512DQ-NEXT:    orq %rax, %rbx
+; AVX512DQ-NEXT:    kmovw %k4, %eax
+; AVX512DQ-NEXT:    kshiftrw $15, %k2, %k4
+; AVX512DQ-NEXT:    orq %r12, %rbx
+; AVX512DQ-NEXT:    movq %rbx, 32(%rsi)
+; AVX512DQ-NEXT:    movzbl %al, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    leaq (%r13,%r13,2), %rbx
+; AVX512DQ-NEXT:    leaq (%rbx,%rax,4), %rbx
+; AVX512DQ-NEXT:    leaq (%rbx,%rax,8), %rbx
+; AVX512DQ-NEXT:    movq %rax, %r12
+; AVX512DQ-NEXT:    shlq $4, %r12
+; AVX512DQ-NEXT:    orq %rbx, %r12
+; AVX512DQ-NEXT:    movq %rax, %rbx
+; AVX512DQ-NEXT:    shlq $5, %rbx
+; AVX512DQ-NEXT:    orq %r12, %rbx
+; AVX512DQ-NEXT:    movq %rax, %r12
+; AVX512DQ-NEXT:    shlq $6, %r12
+; AVX512DQ-NEXT:    movq %rax, %r13
+; AVX512DQ-NEXT:    shlq $7, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    kmovw %k4, %ebp
+; AVX512DQ-NEXT:    kshiftrw $1, %k3, %k4
+; AVX512DQ-NEXT:    shlq $8, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movzbl %bpl, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $9, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $10, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $11, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $12, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $13, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $14, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    shlq $15, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    kmovw %k3, %eax
+; AVX512DQ-NEXT:    movzbl %al, %r13d
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $16, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $17, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $18, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $19, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $20, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $21, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    kmovw %k4, %eax
+; AVX512DQ-NEXT:    kshiftrw $2, %k3, %k4
+; AVX512DQ-NEXT:    shlq $22, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    movzbl %al, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $23, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $24, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $25, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $26, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $27, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $28, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    kmovw %k4, %eax
+; AVX512DQ-NEXT:    kshiftrw $3, %k3, %k4
+; AVX512DQ-NEXT:    shlq $29, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    movzbl %al, %r13d
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $30, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $31, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $32, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $33, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $34, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $35, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    kmovw %k4, %ebp
+; AVX512DQ-NEXT:    kshiftrw $4, %k3, %k4
+; AVX512DQ-NEXT:    shlq $36, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    andl $1, %ebp
+; AVX512DQ-NEXT:    movq %rbp, %rax
+; AVX512DQ-NEXT:    shlq $37, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %rbp, %r12
+; AVX512DQ-NEXT:    shlq $38, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %rbp, %rax
+; AVX512DQ-NEXT:    shlq $39, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %rbp, %r12
+; AVX512DQ-NEXT:    shlq $40, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %rbp, %rax
+; AVX512DQ-NEXT:    shlq $41, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %rbp, %r13
+; AVX512DQ-NEXT:    shlq $42, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    kmovw %k4, %r12d
+; AVX512DQ-NEXT:    kshiftrw $5, %k3, %k3
+; AVX512DQ-NEXT:    shlq $43, %rbp
+; AVX512DQ-NEXT:    orq %r13, %rbp
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $44, %rax
+; AVX512DQ-NEXT:    orq %rbp, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $45, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $46, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $47, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $48, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $49, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    kmovw %k3, %eax
+; AVX512DQ-NEXT:    kshiftrw $3, %k0, %k3
+; AVX512DQ-NEXT:    shlq $50, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    movzbl %al, %r13d
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $51, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $52, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $53, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $54, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $55, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $56, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    kmovw %k3, %ebp
+; AVX512DQ-NEXT:    kshiftrw $2, %k0, %k3
+; AVX512DQ-NEXT:    shlq $57, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    andl $1, %r11d
+; AVX512DQ-NEXT:    movq %r11, %rax
+; AVX512DQ-NEXT:    shlq $58, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r11, %r12
+; AVX512DQ-NEXT:    shlq $59, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r11, %rax
+; AVX512DQ-NEXT:    shlq $60, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r11, %r12
+; AVX512DQ-NEXT:    shlq $61, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    kmovw %k3, %eax
+; AVX512DQ-NEXT:    kshiftrw $4, %k0, %k3
+; AVX512DQ-NEXT:    shlq $62, %r11
+; AVX512DQ-NEXT:    orq %r12, %r11
+; AVX512DQ-NEXT:    shlq $63, %r14
+; AVX512DQ-NEXT:    orq %r11, %r14
+; AVX512DQ-NEXT:    orq %rbx, %r14
+; AVX512DQ-NEXT:    movq %r14, 40(%rsi)
+; AVX512DQ-NEXT:    movzbl %al, %r11d
+; AVX512DQ-NEXT:    movl %r11d, %ebx
+; AVX512DQ-NEXT:    andl $1, %ebx
+; AVX512DQ-NEXT:    leaq (%rbx,%rbx,2), %rax
+; AVX512DQ-NEXT:    leaq (%rax,%rbx,4), %rax
+; AVX512DQ-NEXT:    leaq (%rax,%rbx,8), %rax
+; AVX512DQ-NEXT:    movq %rbx, %r12
+; AVX512DQ-NEXT:    shlq $4, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movzbl %bpl, %r13d
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %r14
+; AVX512DQ-NEXT:    shlq $5, %r14
+; AVX512DQ-NEXT:    orq %r12, %r14
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $6, %rax
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $7, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $8, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $9, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $10, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    kmovw %k3, %ebp
+; AVX512DQ-NEXT:    kshiftrw $5, %k0, %k3
+; AVX512DQ-NEXT:    shlq $11, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movzbl %bpl, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $12, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $13, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $14, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $15, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $16, %rax
+; AVX512DQ-NEXT:    orq %r13, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $17, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    kmovw %k3, %eax
+; AVX512DQ-NEXT:    kshiftrw $11, %k0, %k3
+; AVX512DQ-NEXT:    shlq $18, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    movzbl %al, %r13d
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $19, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $20, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $21, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $22, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $23, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r13, %r12
+; AVX512DQ-NEXT:    shlq $24, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    shlq $25, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    movzbl %r15b, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %r12
+; AVX512DQ-NEXT:    shlq $26, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    movq %rax, %r13
+; AVX512DQ-NEXT:    shlq $27, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    movq %rax, %r12
+; AVX512DQ-NEXT:    shlq $28, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    movq %rax, %r13
+; AVX512DQ-NEXT:    shlq $29, %r13
+; AVX512DQ-NEXT:    orq %r12, %r13
+; AVX512DQ-NEXT:    movq %rax, %r12
+; AVX512DQ-NEXT:    shlq $30, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    kmovw %k3, %ebp
+; AVX512DQ-NEXT:    kshiftrw $7, %k0, %k3
+; AVX512DQ-NEXT:    shlq $31, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    kmovw %k3, %r12d
+; AVX512DQ-NEXT:    kshiftrw $8, %k0, %k3
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    shlq $32, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $33, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movq %r12, %r15
+; AVX512DQ-NEXT:    shlq $34, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $35, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movq %r12, %r15
+; AVX512DQ-NEXT:    shlq $36, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $37, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $38, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    kmovw %k3, %r15d
+; AVX512DQ-NEXT:    kshiftrw $9, %k0, %k3
+; AVX512DQ-NEXT:    shlq $39, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $40, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $41, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $42, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $43, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $44, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r15, %r13
+; AVX512DQ-NEXT:    shlq $45, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    kmovw %k3, %r12d
+; AVX512DQ-NEXT:    kshiftrw $10, %k0, %k3
+; AVX512DQ-NEXT:    shlq $46, %r15
+; AVX512DQ-NEXT:    orq %r13, %r15
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $47, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movq %r12, %r15
+; AVX512DQ-NEXT:    shlq $48, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $49, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movq %r12, %r15
+; AVX512DQ-NEXT:    shlq $50, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $51, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $52, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    kmovw %k3, %r15d
+; AVX512DQ-NEXT:    kshiftrw $10, %k1, %k3
+; AVX512DQ-NEXT:    shlq $53, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $54, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $55, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $56, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $57, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $58, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $59, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    shlq $60, %r15
+; AVX512DQ-NEXT:    orq %r12, %r15
+; AVX512DQ-NEXT:    movzbl %bpl, %eax
+; AVX512DQ-NEXT:    movl %ebp, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $61, %r13
+; AVX512DQ-NEXT:    orq %r15, %r13
+; AVX512DQ-NEXT:    kmovw %k3, %ebp
+; AVX512DQ-NEXT:    kshiftrw $12, %k0, %k3
+; AVX512DQ-NEXT:    shlq $62, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    movq %rax, %r15
+; AVX512DQ-NEXT:    shlq $63, %r15
+; AVX512DQ-NEXT:    orq %r12, %r15
+; AVX512DQ-NEXT:    kmovw %k3, %r12d
+; AVX512DQ-NEXT:    kshiftrw $13, %k0, %k3
+; AVX512DQ-NEXT:    orq %r14, %r15
+; AVX512DQ-NEXT:    movq %r15, 16(%rsi)
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    leaq (%rax,%rax,2), %r14
+; AVX512DQ-NEXT:    leaq (%r14,%rax,4), %r14
+; AVX512DQ-NEXT:    leaq (%r14,%rax,8), %rax
+; AVX512DQ-NEXT:    movzbl %r12b, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %r15
+; AVX512DQ-NEXT:    shlq $4, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    movq %r12, %r14
+; AVX512DQ-NEXT:    shlq $5, %r14
+; AVX512DQ-NEXT:    orq %r15, %r14
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $6, %rax
+; AVX512DQ-NEXT:    movq %r12, %r15
+; AVX512DQ-NEXT:    shlq $7, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $8, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movq %r12, %r15
+; AVX512DQ-NEXT:    shlq $9, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    kmovw %k3, %eax
+; AVX512DQ-NEXT:    kshiftrw $14, %k0, %k3
+; AVX512DQ-NEXT:    shlq $10, %r12
+; AVX512DQ-NEXT:    orq %r15, %r12
+; AVX512DQ-NEXT:    movzbl %al, %r15d
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $11, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $12, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $13, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $14, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $15, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $16, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    kmovw %k3, %eax
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k3
+; AVX512DQ-NEXT:    shlq $17, %r15
+; AVX512DQ-NEXT:    orq %r12, %r15
+; AVX512DQ-NEXT:    movzbl %al, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $18, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movq %r12, %r15
+; AVX512DQ-NEXT:    shlq $19, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $20, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movq %r12, %r15
+; AVX512DQ-NEXT:    shlq $21, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $22, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movq %r12, %r15
+; AVX512DQ-NEXT:    shlq $23, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    kmovw %k3, %eax
+; AVX512DQ-NEXT:    kshiftrw $1, %k2, %k3
+; AVX512DQ-NEXT:    shlq $24, %r12
+; AVX512DQ-NEXT:    orq %r15, %r12
+; AVX512DQ-NEXT:    movzbl %al, %r15d
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $25, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $26, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $27, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $28, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $29, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $30, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    shlq $31, %r15
+; AVX512DQ-NEXT:    orq %r12, %r15
+; AVX512DQ-NEXT:    kmovw %k2, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $32, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movq %r12, %r15
+; AVX512DQ-NEXT:    shlq $33, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $34, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movq %r12, %r15
+; AVX512DQ-NEXT:    shlq $35, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $36, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $37, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    kmovw %k3, %r15d
+; AVX512DQ-NEXT:    kshiftrw $2, %k2, %k3
+; AVX512DQ-NEXT:    shlq $38, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $39, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $40, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $41, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $42, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $43, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $44, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    kmovw %k3, %eax
+; AVX512DQ-NEXT:    kshiftrw $3, %k2, %k2
+; AVX512DQ-NEXT:    shlq $45, %r15
+; AVX512DQ-NEXT:    orq %r12, %r15
+; AVX512DQ-NEXT:    movzbl %al, %r12d
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $46, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movq %r12, %r15
+; AVX512DQ-NEXT:    shlq $47, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $48, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movq %r12, %r15
+; AVX512DQ-NEXT:    shlq $49, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $50, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movq %r12, %r13
+; AVX512DQ-NEXT:    shlq $51, %r13
+; AVX512DQ-NEXT:    orq %rax, %r13
+; AVX512DQ-NEXT:    kmovw %k2, %r15d
+; AVX512DQ-NEXT:    kshiftrw $14, %k1, %k1
+; AVX512DQ-NEXT:    shlq $52, %r12
+; AVX512DQ-NEXT:    orq %r13, %r12
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $53, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $54, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $55, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $56, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $57, %rax
+; AVX512DQ-NEXT:    orq %r12, %rax
+; AVX512DQ-NEXT:    movq %r15, %r12
+; AVX512DQ-NEXT:    shlq $58, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    kmovw %k1, %r13d
+; AVX512DQ-NEXT:    shlq $59, %r15
+; AVX512DQ-NEXT:    orq %r12, %r15
+; AVX512DQ-NEXT:    andl $1, %r9d
+; AVX512DQ-NEXT:    movq %r9, %rax
+; AVX512DQ-NEXT:    shlq $60, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movq %r9, %r12
+; AVX512DQ-NEXT:    shlq $61, %r12
+; AVX512DQ-NEXT:    orq %rax, %r12
+; AVX512DQ-NEXT:    kmovw %k0, %r15d
+; AVX512DQ-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512DQ-NEXT:    shlq $62, %r9
+; AVX512DQ-NEXT:    orq %r12, %r9
+; AVX512DQ-NEXT:    kmovw %k0, %r12d
+; AVX512DQ-NEXT:    shlq $63, %r10
+; AVX512DQ-NEXT:    orq %r9, %r10
+; AVX512DQ-NEXT:    orq %r14, %r10
+; AVX512DQ-NEXT:    movq %r10, 24(%rsi)
+; AVX512DQ-NEXT:    andl $1, %r8d
+; AVX512DQ-NEXT:    leaq (%r8,%r8,2), %rax
+; AVX512DQ-NEXT:    leaq (%rax,%r8,4), %rax
+; AVX512DQ-NEXT:    leaq (%rax,%r8,8), %rax
+; AVX512DQ-NEXT:    movq %r8, %r9
+; AVX512DQ-NEXT:    shlq $4, %r9
+; AVX512DQ-NEXT:    orq %rax, %r9
+; AVX512DQ-NEXT:    shlq $5, %r8
+; AVX512DQ-NEXT:    orq %r9, %r8
+; AVX512DQ-NEXT:    movzbl %bpl, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %r9
+; AVX512DQ-NEXT:    shlq $6, %r9
+; AVX512DQ-NEXT:    movq %rax, %r10
+; AVX512DQ-NEXT:    shlq $7, %r10
+; AVX512DQ-NEXT:    orq %r9, %r10
+; AVX512DQ-NEXT:    movq %rax, %r9
+; AVX512DQ-NEXT:    shlq $8, %r9
+; AVX512DQ-NEXT:    orq %r10, %r9
+; AVX512DQ-NEXT:    movq %rax, %r10
+; AVX512DQ-NEXT:    shlq $9, %r10
+; AVX512DQ-NEXT:    orq %r9, %r10
+; AVX512DQ-NEXT:    movq %rax, %r9
+; AVX512DQ-NEXT:    shlq $10, %r9
+; AVX512DQ-NEXT:    orq %r10, %r9
+; AVX512DQ-NEXT:    movq %rax, %r10
+; AVX512DQ-NEXT:    shlq $11, %r10
+; AVX512DQ-NEXT:    orq %r9, %r10
+; AVX512DQ-NEXT:    shlq $12, %rax
+; AVX512DQ-NEXT:    orq %r10, %rax
+; AVX512DQ-NEXT:    movzbl %dil, %edi
+; AVX512DQ-NEXT:    andl $1, %edi
+; AVX512DQ-NEXT:    movq %rdi, %r9
+; AVX512DQ-NEXT:    shlq $13, %r9
+; AVX512DQ-NEXT:    orq %rax, %r9
+; AVX512DQ-NEXT:    movq %rdi, %rax
+; AVX512DQ-NEXT:    shlq $14, %rax
+; AVX512DQ-NEXT:    orq %r9, %rax
+; AVX512DQ-NEXT:    movq %rdi, %r9
+; AVX512DQ-NEXT:    shlq $15, %r9
+; AVX512DQ-NEXT:    orq %rax, %r9
+; AVX512DQ-NEXT:    movq %rdi, %rax
+; AVX512DQ-NEXT:    shlq $16, %rax
+; AVX512DQ-NEXT:    orq %r9, %rax
+; AVX512DQ-NEXT:    movq %rdi, %r9
+; AVX512DQ-NEXT:    shlq $17, %r9
+; AVX512DQ-NEXT:    orq %rax, %r9
+; AVX512DQ-NEXT:    movq %rdi, %rax
+; AVX512DQ-NEXT:    shlq $18, %rax
+; AVX512DQ-NEXT:    orq %r9, %rax
+; AVX512DQ-NEXT:    shlq $19, %rdi
+; AVX512DQ-NEXT:    orq %rax, %rdi
+; AVX512DQ-NEXT:    movzbl %dl, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %rdx
+; AVX512DQ-NEXT:    shlq $20, %rdx
+; AVX512DQ-NEXT:    orq %rdi, %rdx
+; AVX512DQ-NEXT:    movq %rax, %rdi
+; AVX512DQ-NEXT:    shlq $21, %rdi
+; AVX512DQ-NEXT:    orq %rdx, %rdi
+; AVX512DQ-NEXT:    movq %rax, %rdx
+; AVX512DQ-NEXT:    shlq $22, %rdx
+; AVX512DQ-NEXT:    orq %rdi, %rdx
+; AVX512DQ-NEXT:    movq %rax, %rdi
+; AVX512DQ-NEXT:    shlq $23, %rdi
+; AVX512DQ-NEXT:    orq %rdx, %rdi
+; AVX512DQ-NEXT:    movq %rax, %rdx
+; AVX512DQ-NEXT:    shlq $24, %rdx
+; AVX512DQ-NEXT:    orq %rdi, %rdx
+; AVX512DQ-NEXT:    movq %rax, %rdi
+; AVX512DQ-NEXT:    shlq $25, %rdi
+; AVX512DQ-NEXT:    orq %rdx, %rdi
+; AVX512DQ-NEXT:    shlq $26, %rax
+; AVX512DQ-NEXT:    orq %rdi, %rax
+; AVX512DQ-NEXT:    movzbl %cl, %ecx
+; AVX512DQ-NEXT:    andl $1, %ecx
+; AVX512DQ-NEXT:    movq %rcx, %rdx
+; AVX512DQ-NEXT:    shlq $27, %rdx
+; AVX512DQ-NEXT:    orq %rax, %rdx
+; AVX512DQ-NEXT:    movq %rcx, %rax
+; AVX512DQ-NEXT:    shlq $28, %rax
+; AVX512DQ-NEXT:    orq %rdx, %rax
+; AVX512DQ-NEXT:    movq %rcx, %rdx
+; AVX512DQ-NEXT:    shlq $29, %rdx
+; AVX512DQ-NEXT:    orq %rax, %rdx
+; AVX512DQ-NEXT:    movq %rcx, %rax
+; AVX512DQ-NEXT:    shlq $30, %rax
+; AVX512DQ-NEXT:    orq %rdx, %rax
+; AVX512DQ-NEXT:    movq %rcx, %rdx
+; AVX512DQ-NEXT:    shlq $31, %rdx
+; AVX512DQ-NEXT:    orq %rax, %rdx
+; AVX512DQ-NEXT:    movq %rcx, %rax
+; AVX512DQ-NEXT:    shlq $32, %rax
+; AVX512DQ-NEXT:    orq %rdx, %rax
+; AVX512DQ-NEXT:    shlq $33, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    andl $1, %r13d
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $34, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    movq %r13, %rcx
+; AVX512DQ-NEXT:    shlq $35, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $36, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    movq %r13, %rcx
+; AVX512DQ-NEXT:    shlq $37, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    movq %r13, %rax
+; AVX512DQ-NEXT:    shlq $38, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    movq %r13, %rcx
+; AVX512DQ-NEXT:    shlq $39, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    shlq $40, %r13
+; AVX512DQ-NEXT:    orq %rcx, %r13
+; AVX512DQ-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    movq %rax, %rcx
+; AVX512DQ-NEXT:    shlq $41, %rcx
+; AVX512DQ-NEXT:    orq %r13, %rcx
+; AVX512DQ-NEXT:    movq %rax, %rdx
+; AVX512DQ-NEXT:    shlq $42, %rdx
+; AVX512DQ-NEXT:    orq %rcx, %rdx
+; AVX512DQ-NEXT:    movq %rax, %rcx
+; AVX512DQ-NEXT:    shlq $43, %rcx
+; AVX512DQ-NEXT:    orq %rdx, %rcx
+; AVX512DQ-NEXT:    movq %rax, %rdx
+; AVX512DQ-NEXT:    shlq $44, %rdx
+; AVX512DQ-NEXT:    orq %rcx, %rdx
+; AVX512DQ-NEXT:    movq %rax, %rcx
+; AVX512DQ-NEXT:    shlq $45, %rcx
+; AVX512DQ-NEXT:    orq %rdx, %rcx
+; AVX512DQ-NEXT:    movq %rax, %rdx
+; AVX512DQ-NEXT:    shlq $46, %rdx
+; AVX512DQ-NEXT:    orq %rcx, %rdx
+; AVX512DQ-NEXT:    shlq $47, %rax
+; AVX512DQ-NEXT:    orq %rdx, %rax
+; AVX512DQ-NEXT:    andl $1, %r15d
+; AVX512DQ-NEXT:    movq %r15, %rcx
+; AVX512DQ-NEXT:    shlq $48, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $49, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    movq %r15, %rcx
+; AVX512DQ-NEXT:    shlq $50, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $51, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    movq %r15, %rcx
+; AVX512DQ-NEXT:    shlq $52, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    movq %r15, %rax
+; AVX512DQ-NEXT:    shlq $53, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    shlq $54, %r15
+; AVX512DQ-NEXT:    orq %rax, %r15
+; AVX512DQ-NEXT:    andl $1, %r12d
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $55, %rax
+; AVX512DQ-NEXT:    orq %r15, %rax
+; AVX512DQ-NEXT:    movq %r12, %rcx
+; AVX512DQ-NEXT:    shlq $56, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $57, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    movq %r12, %rcx
+; AVX512DQ-NEXT:    shlq $58, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    movq %r12, %rax
+; AVX512DQ-NEXT:    shlq $59, %rax
+; AVX512DQ-NEXT:    orq %rcx, %rax
+; AVX512DQ-NEXT:    movq %r12, %rcx
+; AVX512DQ-NEXT:    shlq $60, %rcx
+; AVX512DQ-NEXT:    orq %rax, %rcx
+; AVX512DQ-NEXT:    shlq $61, %r12
+; AVX512DQ-NEXT:    orq %rcx, %r12
+; AVX512DQ-NEXT:    shlq $62, %rbx
+; AVX512DQ-NEXT:    orq %r12, %rbx
+; AVX512DQ-NEXT:    shlq $63, %r11
+; AVX512DQ-NEXT:    orq %rbx, %r11
+; AVX512DQ-NEXT:    orq %r8, %r11
+; AVX512DQ-NEXT:    movq %r11, 8(%rsi)
+; AVX512DQ-NEXT:    popq %rbx
+; AVX512DQ-NEXT:    popq %r12
+; AVX512DQ-NEXT:    popq %r13
+; AVX512DQ-NEXT:    popq %r14
+; AVX512DQ-NEXT:    popq %r15
+; AVX512DQ-NEXT:    popq %rbp
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: mask_replication_factor7_vf64:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    pushq %rbp
+; AVX512BW-NEXT:    pushq %r15
+; AVX512BW-NEXT:    pushq %r14
+; AVX512BW-NEXT:    pushq %r13
+; AVX512BW-NEXT:    pushq %r12
+; AVX512BW-NEXT:    pushq %rbx
+; AVX512BW-NEXT:    kmovq (%rdi), %k0
+; AVX512BW-NEXT:    kshiftrq $13, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512BW-NEXT:    kshiftrq $12, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %ecx
+; AVX512BW-NEXT:    kshiftrq $11, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %edx
+; AVX512BW-NEXT:    kshiftrq $10, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %edi
+; AVX512BW-NEXT:    kshiftrq $45, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %ebx
+; AVX512BW-NEXT:    kshiftrq $37, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %ebp
+; AVX512BW-NEXT:    kshiftrq $59, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r9d
+; AVX512BW-NEXT:    kshiftrq $56, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r10d
+; AVX512BW-NEXT:    kshiftrq $54, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r11d
+; AVX512BW-NEXT:    kshiftrq $4, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    kshiftrq $1, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k1, %r15d
+; AVX512BW-NEXT:    kshiftrq $2, %k0, %k1
+; AVX512BW-NEXT:    kmovd %k0, %r14d
+; AVX512BW-NEXT:    movzbl %r14b, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    leaq (%r12,%r12,2), %r14
+; AVX512BW-NEXT:    leaq (%r14,%r12,4), %r14
+; AVX512BW-NEXT:    leaq (%r14,%r12,8), %r14
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $4, %r13
+; AVX512BW-NEXT:    orq %r14, %r13
+; AVX512BW-NEXT:    movq %r12, %r14
+; AVX512BW-NEXT:    shlq $5, %r14
+; AVX512BW-NEXT:    orq %r13, %r14
+; AVX512BW-NEXT:    shlq $6, %r12
+; AVX512BW-NEXT:    movzbl %r15b, %r15d
+; AVX512BW-NEXT:    andl $1, %r15d
+; AVX512BW-NEXT:    movq %r15, %r13
+; AVX512BW-NEXT:    shlq $7, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    movq %r15, %r12
+; AVX512BW-NEXT:    shlq $8, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    movq %r15, %r13
+; AVX512BW-NEXT:    shlq $9, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    movq %r15, %r12
+; AVX512BW-NEXT:    shlq $10, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    movq %r15, %r13
+; AVX512BW-NEXT:    shlq $11, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    movq %r15, %r12
+; AVX512BW-NEXT:    shlq $12, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    kmovd %k1, %r13d
+; AVX512BW-NEXT:    kshiftrq $3, %k0, %k1
+; AVX512BW-NEXT:    shlq $13, %r15
+; AVX512BW-NEXT:    orq %r12, %r15
+; AVX512BW-NEXT:    movzbl %r13b, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $14, %r13
+; AVX512BW-NEXT:    orq %r15, %r13
+; AVX512BW-NEXT:    movq %r12, %r15
+; AVX512BW-NEXT:    shlq $15, %r15
+; AVX512BW-NEXT:    orq %r13, %r15
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $16, %r13
+; AVX512BW-NEXT:    orq %r15, %r13
+; AVX512BW-NEXT:    movq %r12, %r15
+; AVX512BW-NEXT:    shlq $17, %r15
+; AVX512BW-NEXT:    orq %r13, %r15
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $18, %r13
+; AVX512BW-NEXT:    orq %r15, %r13
+; AVX512BW-NEXT:    movq %r12, %r15
+; AVX512BW-NEXT:    shlq $19, %r15
+; AVX512BW-NEXT:    orq %r13, %r15
+; AVX512BW-NEXT:    kmovd %k1, %r13d
+; AVX512BW-NEXT:    kshiftrq $55, %k0, %k1
+; AVX512BW-NEXT:    shlq $20, %r12
+; AVX512BW-NEXT:    orq %r15, %r12
+; AVX512BW-NEXT:    movzbl %r13b, %r15d
+; AVX512BW-NEXT:    andl $1, %r15d
+; AVX512BW-NEXT:    movq %r15, %r13
+; AVX512BW-NEXT:    shlq $21, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    movq %r15, %r12
+; AVX512BW-NEXT:    shlq $22, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    movq %r15, %r13
+; AVX512BW-NEXT:    shlq $23, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    movq %r15, %r12
+; AVX512BW-NEXT:    shlq $24, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    movq %r15, %r13
+; AVX512BW-NEXT:    shlq $25, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    movq %r15, %r12
+; AVX512BW-NEXT:    shlq $26, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    shlq $27, %r15
+; AVX512BW-NEXT:    orq %r12, %r15
+; AVX512BW-NEXT:    movzbl %al, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $28, %r13
+; AVX512BW-NEXT:    orq %r15, %r13
+; AVX512BW-NEXT:    movq %r12, %r15
+; AVX512BW-NEXT:    shlq $29, %r15
+; AVX512BW-NEXT:    orq %r13, %r15
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $30, %r13
+; AVX512BW-NEXT:    orq %r15, %r13
+; AVX512BW-NEXT:    kmovd %k1, %r15d
+; AVX512BW-NEXT:    kshiftrq $5, %k0, %k1
+; AVX512BW-NEXT:    shlq $31, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %r13
+; AVX512BW-NEXT:    shlq $32, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    movq %rax, %r8
+; AVX512BW-NEXT:    shlq $33, %r8
+; AVX512BW-NEXT:    orq %r13, %r8
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrq $6, %k0, %k1
+; AVX512BW-NEXT:    shlq $34, %rax
+; AVX512BW-NEXT:    orq %r8, %rax
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %r8
+; AVX512BW-NEXT:    shlq $35, %r8
+; AVX512BW-NEXT:    orq %rax, %r8
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $36, %rax
+; AVX512BW-NEXT:    orq %r8, %rax
+; AVX512BW-NEXT:    movq %r12, %r8
+; AVX512BW-NEXT:    shlq $37, %r8
+; AVX512BW-NEXT:    orq %rax, %r8
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $38, %rax
+; AVX512BW-NEXT:    orq %r8, %rax
+; AVX512BW-NEXT:    movq %r12, %r8
+; AVX512BW-NEXT:    shlq $39, %r8
+; AVX512BW-NEXT:    orq %rax, %r8
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $40, %rax
+; AVX512BW-NEXT:    orq %r8, %rax
+; AVX512BW-NEXT:    kmovd %k1, %r8d
+; AVX512BW-NEXT:    kshiftrq $7, %k0, %k1
+; AVX512BW-NEXT:    shlq $41, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    andl $1, %r8d
+; AVX512BW-NEXT:    movq %r8, %rax
+; AVX512BW-NEXT:    shlq $42, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r8, %r12
+; AVX512BW-NEXT:    shlq $43, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r8, %rax
+; AVX512BW-NEXT:    shlq $44, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r8, %r12
+; AVX512BW-NEXT:    shlq $45, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r8, %rax
+; AVX512BW-NEXT:    shlq $46, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r8, %r13
+; AVX512BW-NEXT:    shlq $47, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrq $8, %k0, %k1
+; AVX512BW-NEXT:    shlq $48, %r8
+; AVX512BW-NEXT:    orq %r13, %r8
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $49, %rax
+; AVX512BW-NEXT:    orq %r8, %rax
+; AVX512BW-NEXT:    movq %r12, %r8
+; AVX512BW-NEXT:    shlq $50, %r8
+; AVX512BW-NEXT:    orq %rax, %r8
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $51, %rax
+; AVX512BW-NEXT:    orq %r8, %rax
+; AVX512BW-NEXT:    movq %r12, %r8
+; AVX512BW-NEXT:    shlq $52, %r8
+; AVX512BW-NEXT:    orq %rax, %r8
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $53, %rax
+; AVX512BW-NEXT:    orq %r8, %rax
+; AVX512BW-NEXT:    movq %r12, %r8
+; AVX512BW-NEXT:    shlq $54, %r8
+; AVX512BW-NEXT:    orq %rax, %r8
+; AVX512BW-NEXT:    kmovd %k1, %r13d
+; AVX512BW-NEXT:    kshiftrq $9, %k0, %k1
+; AVX512BW-NEXT:    shlq $55, %r12
+; AVX512BW-NEXT:    orq %r8, %r12
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $56, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %r8
+; AVX512BW-NEXT:    shlq $57, %r8
+; AVX512BW-NEXT:    orq %rax, %r8
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $58, %rax
+; AVX512BW-NEXT:    orq %r8, %rax
+; AVX512BW-NEXT:    movq %r13, %r8
+; AVX512BW-NEXT:    shlq $59, %r8
+; AVX512BW-NEXT:    orq %rax, %r8
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $60, %rax
+; AVX512BW-NEXT:    orq %r8, %rax
+; AVX512BW-NEXT:    movq %r13, %r8
+; AVX512BW-NEXT:    shlq $61, %r8
+; AVX512BW-NEXT:    orq %rax, %r8
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    kshiftrq $57, %k0, %k1
+; AVX512BW-NEXT:    shlq $62, %r13
+; AVX512BW-NEXT:    orq %r8, %r13
+; AVX512BW-NEXT:    movzbl %al, %r8d
+; AVX512BW-NEXT:    movq %r8, %rax
+; AVX512BW-NEXT:    shlq $63, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    orq %r14, %rax
+; AVX512BW-NEXT:    movq %rax, (%rsi)
+; AVX512BW-NEXT:    movzbl %r15b, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movzbl %r11b, %r14d
+; AVX512BW-NEXT:    movl %r14d, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    leaq (%rax,%r12,2), %rax
+; AVX512BW-NEXT:    leaq (%rax,%r12,4), %rax
+; AVX512BW-NEXT:    leaq (%rax,%r12,8), %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $4, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r12, %r15
+; AVX512BW-NEXT:    shlq $5, %r15
+; AVX512BW-NEXT:    orq %r13, %r15
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $6, %rax
+; AVX512BW-NEXT:    shlq $7, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movzbl %r10b, %r10d
+; AVX512BW-NEXT:    andl $1, %r10d
+; AVX512BW-NEXT:    movq %r10, %rax
+; AVX512BW-NEXT:    shlq $8, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r10, %r12
+; AVX512BW-NEXT:    shlq $9, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r10, %rax
+; AVX512BW-NEXT:    shlq $10, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r10, %r12
+; AVX512BW-NEXT:    shlq $11, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r10, %rax
+; AVX512BW-NEXT:    shlq $12, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r10, %r12
+; AVX512BW-NEXT:    shlq $13, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    kshiftrq $58, %k0, %k1
+; AVX512BW-NEXT:    shlq $14, %r10
+; AVX512BW-NEXT:    orq %r12, %r10
+; AVX512BW-NEXT:    movzbl %al, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $15, %rax
+; AVX512BW-NEXT:    orq %r10, %rax
+; AVX512BW-NEXT:    movq %r12, %r10
+; AVX512BW-NEXT:    shlq $16, %r10
+; AVX512BW-NEXT:    orq %rax, %r10
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $17, %rax
+; AVX512BW-NEXT:    orq %r10, %rax
+; AVX512BW-NEXT:    movq %r12, %r10
+; AVX512BW-NEXT:    shlq $18, %r10
+; AVX512BW-NEXT:    orq %rax, %r10
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $19, %rax
+; AVX512BW-NEXT:    orq %r10, %rax
+; AVX512BW-NEXT:    movq %r12, %r10
+; AVX512BW-NEXT:    shlq $20, %r10
+; AVX512BW-NEXT:    orq %rax, %r10
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    kshiftrq $63, %k0, %k1
+; AVX512BW-NEXT:    shlq $21, %r12
+; AVX512BW-NEXT:    orq %r10, %r12
+; AVX512BW-NEXT:    movzbl %al, %r10d
+; AVX512BW-NEXT:    andl $1, %r10d
+; AVX512BW-NEXT:    movq %r10, %rax
+; AVX512BW-NEXT:    shlq $22, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r10, %r12
+; AVX512BW-NEXT:    shlq $23, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r10, %rax
+; AVX512BW-NEXT:    shlq $24, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r10, %r12
+; AVX512BW-NEXT:    shlq $25, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r10, %rax
+; AVX512BW-NEXT:    shlq $26, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r10, %r12
+; AVX512BW-NEXT:    shlq $27, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    shlq $28, %r10
+; AVX512BW-NEXT:    orq %r12, %r10
+; AVX512BW-NEXT:    movzbl %r9b, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %r12
+; AVX512BW-NEXT:    shlq $29, %r12
+; AVX512BW-NEXT:    orq %r10, %r12
+; AVX512BW-NEXT:    movq %rax, %r13
+; AVX512BW-NEXT:    shlq $30, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    kmovd %k1, %r10d
+; AVX512BW-NEXT:    kshiftrq $60, %k0, %k1
+; AVX512BW-NEXT:    shlq $31, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    andl $1, %r9d
+; AVX512BW-NEXT:    movq %r9, %r12
+; AVX512BW-NEXT:    shlq $32, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r9, %rax
+; AVX512BW-NEXT:    shlq $33, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r9, %r13
+; AVX512BW-NEXT:    shlq $34, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrq $61, %k0, %k1
+; AVX512BW-NEXT:    shlq $35, %r9
+; AVX512BW-NEXT:    orq %r13, %r9
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $36, %rax
+; AVX512BW-NEXT:    orq %r9, %rax
+; AVX512BW-NEXT:    movq %r12, %r9
+; AVX512BW-NEXT:    shlq $37, %r9
+; AVX512BW-NEXT:    orq %rax, %r9
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $38, %rax
+; AVX512BW-NEXT:    orq %r9, %rax
+; AVX512BW-NEXT:    movq %r12, %r9
+; AVX512BW-NEXT:    shlq $39, %r9
+; AVX512BW-NEXT:    orq %rax, %r9
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $40, %rax
+; AVX512BW-NEXT:    orq %r9, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $41, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    kmovd %k1, %r9d
+; AVX512BW-NEXT:    kshiftrq $62, %k0, %k1
+; AVX512BW-NEXT:    shlq $42, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    andl $1, %r9d
+; AVX512BW-NEXT:    movq %r9, %rax
+; AVX512BW-NEXT:    shlq $43, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r9, %r12
+; AVX512BW-NEXT:    shlq $44, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r9, %rax
+; AVX512BW-NEXT:    shlq $45, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r9, %r12
+; AVX512BW-NEXT:    shlq $46, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r9, %rax
+; AVX512BW-NEXT:    shlq $47, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r9, %r13
+; AVX512BW-NEXT:    shlq $48, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrq $36, %k0, %k1
+; AVX512BW-NEXT:    shlq $49, %r9
+; AVX512BW-NEXT:    orq %r13, %r9
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $50, %rax
+; AVX512BW-NEXT:    orq %r9, %rax
+; AVX512BW-NEXT:    movq %r12, %r9
+; AVX512BW-NEXT:    shlq $51, %r9
+; AVX512BW-NEXT:    orq %rax, %r9
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $52, %rax
+; AVX512BW-NEXT:    orq %r9, %rax
+; AVX512BW-NEXT:    movq %r12, %r9
+; AVX512BW-NEXT:    shlq $53, %r9
+; AVX512BW-NEXT:    orq %rax, %r9
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $54, %rax
+; AVX512BW-NEXT:    orq %r9, %rax
+; AVX512BW-NEXT:    movq %r12, %r9
+; AVX512BW-NEXT:    shlq $55, %r9
+; AVX512BW-NEXT:    orq %rax, %r9
+; AVX512BW-NEXT:    shlq $56, %r12
+; AVX512BW-NEXT:    orq %r9, %r12
+; AVX512BW-NEXT:    movl %r10d, %r13d
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $57, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %r9
+; AVX512BW-NEXT:    shlq $58, %r9
+; AVX512BW-NEXT:    orq %rax, %r9
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $59, %rax
+; AVX512BW-NEXT:    orq %r9, %rax
+; AVX512BW-NEXT:    movq %r13, %r9
+; AVX512BW-NEXT:    shlq $60, %r9
+; AVX512BW-NEXT:    orq %rax, %r9
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $61, %rax
+; AVX512BW-NEXT:    orq %r9, %rax
+; AVX512BW-NEXT:    kmovd %k1, %r9d
+; AVX512BW-NEXT:    kshiftrq $38, %k0, %k1
+; AVX512BW-NEXT:    movzbl %r10b, %r10d
+; AVX512BW-NEXT:    shlq $62, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    shlq $63, %r10
+; AVX512BW-NEXT:    orq %r13, %r10
+; AVX512BW-NEXT:    orq %r15, %r10
+; AVX512BW-NEXT:    movq %r10, 48(%rsi)
+; AVX512BW-NEXT:    movzbl %r9b, %r10d
+; AVX512BW-NEXT:    movl %r10d, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    leaq (%rax,%rax,2), %r15
+; AVX512BW-NEXT:    leaq (%r15,%rax,4), %rax
+; AVX512BW-NEXT:    movzbl %bpl, %r13d
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    leaq (%rax,%r13,8), %rax
+; AVX512BW-NEXT:    movq %r13, %r15
+; AVX512BW-NEXT:    shlq $4, %r15
+; AVX512BW-NEXT:    orq %rax, %r15
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $5, %r12
+; AVX512BW-NEXT:    orq %r15, %r12
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $6, %rax
+; AVX512BW-NEXT:    movq %r13, %r15
+; AVX512BW-NEXT:    shlq $7, %r15
+; AVX512BW-NEXT:    orq %rax, %r15
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $8, %rax
+; AVX512BW-NEXT:    orq %r15, %rax
+; AVX512BW-NEXT:    kmovd %k1, %ebp
+; AVX512BW-NEXT:    kshiftrq $39, %k0, %k1
+; AVX512BW-NEXT:    shlq $9, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movzbl %bpl, %r15d
+; AVX512BW-NEXT:    andl $1, %r15d
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $10, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r15, %r13
+; AVX512BW-NEXT:    shlq $11, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $12, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r15, %r13
+; AVX512BW-NEXT:    shlq $13, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $14, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r15, %r13
+; AVX512BW-NEXT:    shlq $15, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    kshiftrq $40, %k0, %k1
+; AVX512BW-NEXT:    shlq $16, %r15
+; AVX512BW-NEXT:    orq %r13, %r15
+; AVX512BW-NEXT:    movzbl %al, %r13d
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $17, %rax
+; AVX512BW-NEXT:    orq %r15, %rax
+; AVX512BW-NEXT:    movq %r13, %r15
+; AVX512BW-NEXT:    shlq $18, %r15
+; AVX512BW-NEXT:    orq %rax, %r15
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $19, %rax
+; AVX512BW-NEXT:    orq %r15, %rax
+; AVX512BW-NEXT:    movq %r13, %r15
+; AVX512BW-NEXT:    shlq $20, %r15
+; AVX512BW-NEXT:    orq %rax, %r15
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $21, %rax
+; AVX512BW-NEXT:    orq %r15, %rax
+; AVX512BW-NEXT:    movq %r13, %r15
+; AVX512BW-NEXT:    shlq $22, %r15
+; AVX512BW-NEXT:    orq %rax, %r15
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    kshiftrq $41, %k0, %k1
+; AVX512BW-NEXT:    shlq $23, %r13
+; AVX512BW-NEXT:    orq %r15, %r13
+; AVX512BW-NEXT:    movzbl %al, %ebp
+; AVX512BW-NEXT:    andl $1, %ebp
+; AVX512BW-NEXT:    movq %rbp, %rax
+; AVX512BW-NEXT:    shlq $24, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %rbp, %r15
+; AVX512BW-NEXT:    shlq $25, %r15
+; AVX512BW-NEXT:    orq %rax, %r15
+; AVX512BW-NEXT:    movq %rbp, %rax
+; AVX512BW-NEXT:    shlq $26, %rax
+; AVX512BW-NEXT:    orq %r15, %rax
+; AVX512BW-NEXT:    movq %rbp, %r15
+; AVX512BW-NEXT:    shlq $27, %r15
+; AVX512BW-NEXT:    orq %rax, %r15
+; AVX512BW-NEXT:    movq %rbp, %rax
+; AVX512BW-NEXT:    shlq $28, %rax
+; AVX512BW-NEXT:    orq %r15, %rax
+; AVX512BW-NEXT:    movq %rbp, %r13
+; AVX512BW-NEXT:    shlq $29, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    kmovd %k1, %r15d
+; AVX512BW-NEXT:    kshiftrq $42, %k0, %k1
+; AVX512BW-NEXT:    shlq $30, %rbp
+; AVX512BW-NEXT:    orq %r13, %rbp
+; AVX512BW-NEXT:    movzbl %r15b, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    shlq $31, %rax
+; AVX512BW-NEXT:    orq %rbp, %rax
+; AVX512BW-NEXT:    andl $1, %r15d
+; AVX512BW-NEXT:    movq %r15, %r13
+; AVX512BW-NEXT:    shlq $32, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $33, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r15, %r13
+; AVX512BW-NEXT:    shlq $34, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $35, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r15, %rbp
+; AVX512BW-NEXT:    shlq $36, %rbp
+; AVX512BW-NEXT:    orq %rax, %rbp
+; AVX512BW-NEXT:    kmovd %k1, %r13d
+; AVX512BW-NEXT:    kshiftrq $43, %k0, %k1
+; AVX512BW-NEXT:    shlq $37, %r15
+; AVX512BW-NEXT:    orq %rbp, %r15
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $38, %rax
+; AVX512BW-NEXT:    orq %r15, %rax
+; AVX512BW-NEXT:    movq %r13, %r15
+; AVX512BW-NEXT:    shlq $39, %r15
+; AVX512BW-NEXT:    orq %rax, %r15
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $40, %rax
+; AVX512BW-NEXT:    orq %r15, %rax
+; AVX512BW-NEXT:    movq %r13, %r15
+; AVX512BW-NEXT:    shlq $41, %r15
+; AVX512BW-NEXT:    orq %rax, %r15
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $42, %rax
+; AVX512BW-NEXT:    orq %r15, %rax
+; AVX512BW-NEXT:    movq %r13, %rbp
+; AVX512BW-NEXT:    shlq $43, %rbp
+; AVX512BW-NEXT:    orq %rax, %rbp
+; AVX512BW-NEXT:    kmovd %k1, %r15d
+; AVX512BW-NEXT:    kshiftrq $44, %k0, %k1
+; AVX512BW-NEXT:    shlq $44, %r13
+; AVX512BW-NEXT:    orq %rbp, %r13
+; AVX512BW-NEXT:    andl $1, %r15d
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $45, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r15, %r13
+; AVX512BW-NEXT:    shlq $46, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $47, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r15, %r13
+; AVX512BW-NEXT:    shlq $48, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $49, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r15, %rbp
+; AVX512BW-NEXT:    shlq $50, %rbp
+; AVX512BW-NEXT:    orq %rax, %rbp
+; AVX512BW-NEXT:    kmovd %k1, %r13d
+; AVX512BW-NEXT:    kshiftrq $22, %k0, %k1
+; AVX512BW-NEXT:    shlq $51, %r15
+; AVX512BW-NEXT:    orq %rbp, %r15
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $52, %rax
+; AVX512BW-NEXT:    orq %r15, %rax
+; AVX512BW-NEXT:    movq %r13, %r15
+; AVX512BW-NEXT:    shlq $53, %r15
+; AVX512BW-NEXT:    orq %rax, %r15
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $54, %rax
+; AVX512BW-NEXT:    orq %r15, %rax
+; AVX512BW-NEXT:    movq %r13, %r15
+; AVX512BW-NEXT:    shlq $55, %r15
+; AVX512BW-NEXT:    orq %rax, %r15
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $56, %rax
+; AVX512BW-NEXT:    orq %r15, %rax
+; AVX512BW-NEXT:    movq %r13, %r15
+; AVX512BW-NEXT:    shlq $57, %r15
+; AVX512BW-NEXT:    orq %rax, %r15
+; AVX512BW-NEXT:    shlq $58, %r13
+; AVX512BW-NEXT:    orq %r15, %r13
+; AVX512BW-NEXT:    movl %ebx, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %r15
+; AVX512BW-NEXT:    shlq $59, %r15
+; AVX512BW-NEXT:    orq %r13, %r15
+; AVX512BW-NEXT:    movq %rax, %r13
+; AVX512BW-NEXT:    shlq $60, %r13
+; AVX512BW-NEXT:    orq %r15, %r13
+; AVX512BW-NEXT:    movq %rax, %rbp
+; AVX512BW-NEXT:    shlq $61, %rbp
+; AVX512BW-NEXT:    orq %r13, %rbp
+; AVX512BW-NEXT:    kmovd %k1, %r15d
+; AVX512BW-NEXT:    kshiftrq $46, %k0, %k1
+; AVX512BW-NEXT:    movzbl %bl, %ebx
+; AVX512BW-NEXT:    shlq $62, %rax
+; AVX512BW-NEXT:    orq %rbp, %rax
+; AVX512BW-NEXT:    movq %rbx, %r13
+; AVX512BW-NEXT:    shlq $63, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    kshiftrq $47, %k0, %k1
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    movq %r13, 32(%rsi)
+; AVX512BW-NEXT:    movzbl %al, %r13d
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    andl $1, %ebx
+; AVX512BW-NEXT:    leaq (%rbx,%rbx,2), %rax
+; AVX512BW-NEXT:    leaq (%rax,%r13,4), %rax
+; AVX512BW-NEXT:    leaq (%rax,%r13,8), %rax
+; AVX512BW-NEXT:    movq %r13, %rbx
+; AVX512BW-NEXT:    shlq $4, %rbx
+; AVX512BW-NEXT:    orq %rax, %rbx
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $5, %r12
+; AVX512BW-NEXT:    orq %rbx, %r12
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $6, %rax
+; AVX512BW-NEXT:    movq %r13, %rbx
+; AVX512BW-NEXT:    shlq $7, %rbx
+; AVX512BW-NEXT:    orq %rax, %rbx
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    kshiftrq $48, %k0, %k1
+; AVX512BW-NEXT:    shlq $8, %r13
+; AVX512BW-NEXT:    orq %rbx, %r13
+; AVX512BW-NEXT:    movzbl %al, %ebx
+; AVX512BW-NEXT:    andl $1, %ebx
+; AVX512BW-NEXT:    movq %rbx, %rax
+; AVX512BW-NEXT:    shlq $9, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %rbx, %r13
+; AVX512BW-NEXT:    shlq $10, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %rbx, %rax
+; AVX512BW-NEXT:    shlq $11, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %rbx, %r13
+; AVX512BW-NEXT:    shlq $12, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %rbx, %rax
+; AVX512BW-NEXT:    shlq $13, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %rbx, %r13
+; AVX512BW-NEXT:    shlq $14, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    kshiftrq $49, %k0, %k1
+; AVX512BW-NEXT:    shlq $15, %rbx
+; AVX512BW-NEXT:    orq %r13, %rbx
+; AVX512BW-NEXT:    movzbl %al, %r13d
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $16, %rax
+; AVX512BW-NEXT:    orq %rbx, %rax
+; AVX512BW-NEXT:    movq %r13, %rbx
+; AVX512BW-NEXT:    shlq $17, %rbx
+; AVX512BW-NEXT:    orq %rax, %rbx
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $18, %rax
+; AVX512BW-NEXT:    orq %rbx, %rax
+; AVX512BW-NEXT:    movq %r13, %rbx
+; AVX512BW-NEXT:    shlq $19, %rbx
+; AVX512BW-NEXT:    orq %rax, %rbx
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $20, %rax
+; AVX512BW-NEXT:    orq %rbx, %rax
+; AVX512BW-NEXT:    movq %r13, %rbx
+; AVX512BW-NEXT:    shlq $21, %rbx
+; AVX512BW-NEXT:    orq %rax, %rbx
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    kshiftrq $50, %k0, %k1
+; AVX512BW-NEXT:    shlq $22, %r13
+; AVX512BW-NEXT:    orq %rbx, %r13
+; AVX512BW-NEXT:    movzbl %al, %ebx
+; AVX512BW-NEXT:    andl $1, %ebx
+; AVX512BW-NEXT:    movq %rbx, %rax
+; AVX512BW-NEXT:    shlq $23, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %rbx, %r13
+; AVX512BW-NEXT:    shlq $24, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %rbx, %rax
+; AVX512BW-NEXT:    shlq $25, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %rbx, %r13
+; AVX512BW-NEXT:    shlq $26, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %rbx, %rax
+; AVX512BW-NEXT:    shlq $27, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %rbx, %r13
+; AVX512BW-NEXT:    shlq $28, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    kshiftrq $51, %k0, %k1
+; AVX512BW-NEXT:    shlq $29, %rbx
+; AVX512BW-NEXT:    orq %r13, %rbx
+; AVX512BW-NEXT:    movzbl %al, %r13d
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $30, %rax
+; AVX512BW-NEXT:    orq %rbx, %rax
+; AVX512BW-NEXT:    movq %r13, %rbx
+; AVX512BW-NEXT:    shlq $31, %rbx
+; AVX512BW-NEXT:    orq %rax, %rbx
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $32, %rax
+; AVX512BW-NEXT:    orq %rbx, %rax
+; AVX512BW-NEXT:    movq %r13, %rbx
+; AVX512BW-NEXT:    shlq $33, %rbx
+; AVX512BW-NEXT:    orq %rax, %rbx
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $34, %rax
+; AVX512BW-NEXT:    orq %rbx, %rax
+; AVX512BW-NEXT:    movq %r13, %rbx
+; AVX512BW-NEXT:    shlq $35, %rbx
+; AVX512BW-NEXT:    orq %rax, %rbx
+; AVX512BW-NEXT:    kmovd %k1, %ebp
+; AVX512BW-NEXT:    kshiftrq $52, %k0, %k1
+; AVX512BW-NEXT:    shlq $36, %r13
+; AVX512BW-NEXT:    orq %rbx, %r13
+; AVX512BW-NEXT:    andl $1, %ebp
+; AVX512BW-NEXT:    movq %rbp, %rax
+; AVX512BW-NEXT:    shlq $37, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %rbp, %rbx
+; AVX512BW-NEXT:    shlq $38, %rbx
+; AVX512BW-NEXT:    orq %rax, %rbx
+; AVX512BW-NEXT:    movq %rbp, %rax
+; AVX512BW-NEXT:    shlq $39, %rax
+; AVX512BW-NEXT:    orq %rbx, %rax
+; AVX512BW-NEXT:    movq %rbp, %rbx
+; AVX512BW-NEXT:    shlq $40, %rbx
+; AVX512BW-NEXT:    orq %rax, %rbx
+; AVX512BW-NEXT:    movq %rbp, %rax
+; AVX512BW-NEXT:    shlq $41, %rax
+; AVX512BW-NEXT:    orq %rbx, %rax
+; AVX512BW-NEXT:    movq %rbp, %r13
+; AVX512BW-NEXT:    shlq $42, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    kmovd %k1, %ebx
+; AVX512BW-NEXT:    kshiftrq $53, %k0, %k1
+; AVX512BW-NEXT:    shlq $43, %rbp
+; AVX512BW-NEXT:    orq %r13, %rbp
+; AVX512BW-NEXT:    andl $1, %ebx
+; AVX512BW-NEXT:    movq %rbx, %rax
+; AVX512BW-NEXT:    shlq $44, %rax
+; AVX512BW-NEXT:    orq %rbp, %rax
+; AVX512BW-NEXT:    movq %rbx, %r13
+; AVX512BW-NEXT:    shlq $45, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %rbx, %rax
+; AVX512BW-NEXT:    shlq $46, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %rbx, %r13
+; AVX512BW-NEXT:    shlq $47, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %rbx, %rax
+; AVX512BW-NEXT:    shlq $48, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %rbx, %r13
+; AVX512BW-NEXT:    shlq $49, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    kshiftrq $19, %k0, %k1
+; AVX512BW-NEXT:    shlq $50, %rbx
+; AVX512BW-NEXT:    orq %r13, %rbx
+; AVX512BW-NEXT:    movzbl %al, %r13d
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $51, %rax
+; AVX512BW-NEXT:    orq %rbx, %rax
+; AVX512BW-NEXT:    movq %r13, %rbx
+; AVX512BW-NEXT:    shlq $52, %rbx
+; AVX512BW-NEXT:    orq %rax, %rbx
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $53, %rax
+; AVX512BW-NEXT:    orq %rbx, %rax
+; AVX512BW-NEXT:    movq %r13, %rbx
+; AVX512BW-NEXT:    shlq $54, %rbx
+; AVX512BW-NEXT:    orq %rax, %rbx
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $55, %rax
+; AVX512BW-NEXT:    orq %rbx, %rax
+; AVX512BW-NEXT:    movq %r13, %rbx
+; AVX512BW-NEXT:    shlq $56, %rbx
+; AVX512BW-NEXT:    orq %rax, %rbx
+; AVX512BW-NEXT:    kmovd %k1, %ebp
+; AVX512BW-NEXT:    kshiftrq $18, %k0, %k1
+; AVX512BW-NEXT:    shlq $57, %r13
+; AVX512BW-NEXT:    orq %rbx, %r13
+; AVX512BW-NEXT:    andl $1, %r11d
+; AVX512BW-NEXT:    movq %r11, %rax
+; AVX512BW-NEXT:    shlq $58, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r11, %rbx
+; AVX512BW-NEXT:    shlq $59, %rbx
+; AVX512BW-NEXT:    orq %rax, %rbx
+; AVX512BW-NEXT:    movq %r11, %rax
+; AVX512BW-NEXT:    shlq $60, %rax
+; AVX512BW-NEXT:    orq %rbx, %rax
+; AVX512BW-NEXT:    movq %r11, %r13
+; AVX512BW-NEXT:    shlq $61, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    kmovd %k1, %ebx
+; AVX512BW-NEXT:    kshiftrq $20, %k0, %k1
+; AVX512BW-NEXT:    shlq $62, %r11
+; AVX512BW-NEXT:    orq %r13, %r11
+; AVX512BW-NEXT:    shlq $63, %r14
+; AVX512BW-NEXT:    orq %r11, %r14
+; AVX512BW-NEXT:    orq %r12, %r14
+; AVX512BW-NEXT:    movq %r14, 40(%rsi)
+; AVX512BW-NEXT:    movzbl %bl, %r11d
+; AVX512BW-NEXT:    movl %r11d, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    leaq (%rax,%rax,2), %r14
+; AVX512BW-NEXT:    leaq (%r14,%rax,4), %r14
+; AVX512BW-NEXT:    leaq (%r14,%rax,8), %r14
+; AVX512BW-NEXT:    shlq $4, %rax
+; AVX512BW-NEXT:    orq %r14, %rax
+; AVX512BW-NEXT:    movzbl %bpl, %r13d
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movq %r13, %r14
+; AVX512BW-NEXT:    shlq $5, %r14
+; AVX512BW-NEXT:    orq %rax, %r14
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $6, %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $7, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $8, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $9, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $10, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    kmovd %k1, %ebp
+; AVX512BW-NEXT:    kshiftrq $21, %k0, %k1
+; AVX512BW-NEXT:    shlq $11, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movzbl %bpl, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $12, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $13, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $14, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $15, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $16, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $17, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    kshiftrq $27, %k0, %k1
+; AVX512BW-NEXT:    shlq $18, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    movzbl %al, %r13d
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $19, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $20, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $21, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $22, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $23, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $24, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    shlq $25, %r13
+; AVX512BW-NEXT:    orq %r12, %r13
+; AVX512BW-NEXT:    movzbl %r15b, %ebp
+; AVX512BW-NEXT:    andl $1, %ebp
+; AVX512BW-NEXT:    movq %rbp, %rax
+; AVX512BW-NEXT:    shlq $26, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %rbp, %r12
+; AVX512BW-NEXT:    shlq $27, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %rbp, %rax
+; AVX512BW-NEXT:    shlq $28, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %rbp, %r12
+; AVX512BW-NEXT:    shlq $29, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %rbp, %rax
+; AVX512BW-NEXT:    shlq $30, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrq $23, %k0, %k1
+; AVX512BW-NEXT:    shlq $31, %rbp
+; AVX512BW-NEXT:    orq %rax, %rbp
+; AVX512BW-NEXT:    kmovd %k1, %r13d
+; AVX512BW-NEXT:    kshiftrq $24, %k0, %k1
+; AVX512BW-NEXT:    andl $1, %r15d
+; AVX512BW-NEXT:    shlq $32, %r15
+; AVX512BW-NEXT:    orq %rbp, %r15
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $33, %rax
+; AVX512BW-NEXT:    orq %r15, %rax
+; AVX512BW-NEXT:    movq %r13, %r15
+; AVX512BW-NEXT:    shlq $34, %r15
+; AVX512BW-NEXT:    orq %rax, %r15
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $35, %rax
+; AVX512BW-NEXT:    orq %r15, %rax
+; AVX512BW-NEXT:    movq %r13, %r15
+; AVX512BW-NEXT:    shlq $36, %r15
+; AVX512BW-NEXT:    orq %rax, %r15
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $37, %rax
+; AVX512BW-NEXT:    orq %r15, %rax
+; AVX512BW-NEXT:    movq %r13, %rbp
+; AVX512BW-NEXT:    shlq $38, %rbp
+; AVX512BW-NEXT:    orq %rax, %rbp
+; AVX512BW-NEXT:    kmovd %k1, %r15d
+; AVX512BW-NEXT:    kshiftrq $25, %k0, %k1
+; AVX512BW-NEXT:    shlq $39, %r13
+; AVX512BW-NEXT:    orq %rbp, %r13
+; AVX512BW-NEXT:    andl $1, %r15d
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $40, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r15, %r13
+; AVX512BW-NEXT:    shlq $41, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $42, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r15, %r13
+; AVX512BW-NEXT:    shlq $43, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $44, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r15, %rbp
+; AVX512BW-NEXT:    shlq $45, %rbp
+; AVX512BW-NEXT:    orq %rax, %rbp
+; AVX512BW-NEXT:    kmovd %k1, %r13d
+; AVX512BW-NEXT:    kshiftrq $26, %k0, %k1
+; AVX512BW-NEXT:    shlq $46, %r15
+; AVX512BW-NEXT:    orq %rbp, %r15
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $47, %rax
+; AVX512BW-NEXT:    orq %r15, %rax
+; AVX512BW-NEXT:    movq %r13, %r15
+; AVX512BW-NEXT:    shlq $48, %r15
+; AVX512BW-NEXT:    orq %rax, %r15
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $49, %rax
+; AVX512BW-NEXT:    orq %r15, %rax
+; AVX512BW-NEXT:    movq %r13, %r15
+; AVX512BW-NEXT:    shlq $50, %r15
+; AVX512BW-NEXT:    orq %rax, %r15
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $51, %rax
+; AVX512BW-NEXT:    orq %r15, %rax
+; AVX512BW-NEXT:    movq %r13, %rbp
+; AVX512BW-NEXT:    shlq $52, %rbp
+; AVX512BW-NEXT:    orq %rax, %rbp
+; AVX512BW-NEXT:    kmovd %k1, %r15d
+; AVX512BW-NEXT:    kshiftrq $31, %k0, %k1
+; AVX512BW-NEXT:    shlq $53, %r13
+; AVX512BW-NEXT:    orq %rbp, %r13
+; AVX512BW-NEXT:    andl $1, %r15d
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $54, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r15, %r13
+; AVX512BW-NEXT:    shlq $55, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $56, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r15, %r13
+; AVX512BW-NEXT:    shlq $57, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $58, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r15, %r13
+; AVX512BW-NEXT:    shlq $59, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    shlq $60, %r15
+; AVX512BW-NEXT:    orq %r13, %r15
+; AVX512BW-NEXT:    movl %r12d, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %r13
+; AVX512BW-NEXT:    shlq $61, %r13
+; AVX512BW-NEXT:    orq %r15, %r13
+; AVX512BW-NEXT:    kmovd %k1, %ebp
+; AVX512BW-NEXT:    kshiftrq $28, %k0, %k1
+; AVX512BW-NEXT:    movzbl %r12b, %r15d
+; AVX512BW-NEXT:    shlq $62, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r15, %r12
+; AVX512BW-NEXT:    shlq $63, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    kshiftrq $29, %k0, %k1
+; AVX512BW-NEXT:    orq %r14, %r12
+; AVX512BW-NEXT:    movq %r12, 16(%rsi)
+; AVX512BW-NEXT:    andl $1, %r15d
+; AVX512BW-NEXT:    leaq (%r15,%r15,2), %r14
+; AVX512BW-NEXT:    leaq (%r14,%r15,4), %r14
+; AVX512BW-NEXT:    leaq (%r14,%r15,8), %r14
+; AVX512BW-NEXT:    movzbl %al, %r15d
+; AVX512BW-NEXT:    andl $1, %r15d
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $4, %rax
+; AVX512BW-NEXT:    orq %r14, %rax
+; AVX512BW-NEXT:    movq %r15, %r14
+; AVX512BW-NEXT:    shlq $5, %r14
+; AVX512BW-NEXT:    orq %rax, %r14
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $6, %rax
+; AVX512BW-NEXT:    movq %r15, %r12
+; AVX512BW-NEXT:    shlq $7, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $8, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r15, %r12
+; AVX512BW-NEXT:    shlq $9, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    kshiftrq $30, %k0, %k1
+; AVX512BW-NEXT:    shlq $10, %r15
+; AVX512BW-NEXT:    orq %r12, %r15
+; AVX512BW-NEXT:    movzbl %al, %r12d
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $11, %rax
+; AVX512BW-NEXT:    orq %r15, %rax
+; AVX512BW-NEXT:    movq %r12, %r15
+; AVX512BW-NEXT:    shlq $12, %r15
+; AVX512BW-NEXT:    orq %rax, %r15
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $13, %rax
+; AVX512BW-NEXT:    orq %r15, %rax
+; AVX512BW-NEXT:    movq %r12, %r15
+; AVX512BW-NEXT:    shlq $14, %r15
+; AVX512BW-NEXT:    orq %rax, %r15
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $15, %rax
+; AVX512BW-NEXT:    orq %r15, %rax
+; AVX512BW-NEXT:    movq %r12, %r15
+; AVX512BW-NEXT:    shlq $16, %r15
+; AVX512BW-NEXT:    orq %rax, %r15
+; AVX512BW-NEXT:    kmovd %k1, %eax
+; AVX512BW-NEXT:    kshiftrq $32, %k0, %k1
+; AVX512BW-NEXT:    shlq $17, %r12
+; AVX512BW-NEXT:    orq %r15, %r12
+; AVX512BW-NEXT:    movzbl %al, %r15d
+; AVX512BW-NEXT:    andl $1, %r15d
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $18, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r15, %r12
+; AVX512BW-NEXT:    shlq $19, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $20, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r15, %r12
+; AVX512BW-NEXT:    shlq $21, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $22, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r15, %r13
+; AVX512BW-NEXT:    shlq $23, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrq $33, %k0, %k1
+; AVX512BW-NEXT:    shlq $24, %r15
+; AVX512BW-NEXT:    orq %r13, %r15
+; AVX512BW-NEXT:    movzbl %bpl, %ebp
+; AVX512BW-NEXT:    andl $1, %ebp
+; AVX512BW-NEXT:    movq %rbp, %rax
+; AVX512BW-NEXT:    shlq $25, %rax
+; AVX512BW-NEXT:    orq %r15, %rax
+; AVX512BW-NEXT:    movq %rbp, %r15
+; AVX512BW-NEXT:    shlq $26, %r15
+; AVX512BW-NEXT:    orq %rax, %r15
+; AVX512BW-NEXT:    movq %rbp, %rax
+; AVX512BW-NEXT:    shlq $27, %rax
+; AVX512BW-NEXT:    orq %r15, %rax
+; AVX512BW-NEXT:    movq %rbp, %r15
+; AVX512BW-NEXT:    shlq $28, %r15
+; AVX512BW-NEXT:    orq %rax, %r15
+; AVX512BW-NEXT:    movq %rbp, %rax
+; AVX512BW-NEXT:    shlq $29, %rax
+; AVX512BW-NEXT:    orq %r15, %rax
+; AVX512BW-NEXT:    movq %rbp, %r15
+; AVX512BW-NEXT:    shlq $30, %r15
+; AVX512BW-NEXT:    orq %rax, %r15
+; AVX512BW-NEXT:    kmovd %k1, %r13d
+; AVX512BW-NEXT:    kshiftrq $34, %k0, %k1
+; AVX512BW-NEXT:    shlq $31, %rbp
+; AVX512BW-NEXT:    orq %r15, %rbp
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $32, %rax
+; AVX512BW-NEXT:    orq %rbp, %rax
+; AVX512BW-NEXT:    movq %r12, %r15
+; AVX512BW-NEXT:    shlq $33, %r15
+; AVX512BW-NEXT:    orq %rax, %r15
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $34, %rax
+; AVX512BW-NEXT:    orq %r15, %rax
+; AVX512BW-NEXT:    movq %r12, %r15
+; AVX512BW-NEXT:    shlq $35, %r15
+; AVX512BW-NEXT:    orq %rax, %r15
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $36, %rax
+; AVX512BW-NEXT:    orq %r15, %rax
+; AVX512BW-NEXT:    movq %r12, %rbp
+; AVX512BW-NEXT:    shlq $37, %rbp
+; AVX512BW-NEXT:    orq %rax, %rbp
+; AVX512BW-NEXT:    kmovd %k1, %r15d
+; AVX512BW-NEXT:    kshiftrq $35, %k0, %k1
+; AVX512BW-NEXT:    shlq $38, %r12
+; AVX512BW-NEXT:    orq %rbp, %r12
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $39, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $40, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $41, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %r12
+; AVX512BW-NEXT:    shlq $42, %r12
+; AVX512BW-NEXT:    orq %rax, %r12
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $43, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %rbp
+; AVX512BW-NEXT:    shlq $44, %rbp
+; AVX512BW-NEXT:    orq %rax, %rbp
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrq $14, %k0, %k1
+; AVX512BW-NEXT:    shlq $45, %r13
+; AVX512BW-NEXT:    orq %rbp, %r13
+; AVX512BW-NEXT:    andl $1, %r15d
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $46, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r15, %r13
+; AVX512BW-NEXT:    shlq $47, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $48, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r15, %r13
+; AVX512BW-NEXT:    shlq $49, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $50, %rax
+; AVX512BW-NEXT:    orq %r13, %rax
+; AVX512BW-NEXT:    movq %r15, %r13
+; AVX512BW-NEXT:    shlq $51, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    kmovd %k1, %ebp
+; AVX512BW-NEXT:    kshiftrq $15, %k0, %k1
+; AVX512BW-NEXT:    shlq $52, %r15
+; AVX512BW-NEXT:    orq %r13, %r15
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $53, %rax
+; AVX512BW-NEXT:    orq %r15, %rax
+; AVX512BW-NEXT:    movq %r12, %r15
+; AVX512BW-NEXT:    shlq $54, %r15
+; AVX512BW-NEXT:    orq %rax, %r15
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $55, %rax
+; AVX512BW-NEXT:    orq %r15, %rax
+; AVX512BW-NEXT:    movq %r12, %r15
+; AVX512BW-NEXT:    shlq $56, %r15
+; AVX512BW-NEXT:    orq %rax, %r15
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $57, %rax
+; AVX512BW-NEXT:    orq %r15, %rax
+; AVX512BW-NEXT:    movq %r12, %r13
+; AVX512BW-NEXT:    shlq $58, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    kmovd %k1, %r15d
+; AVX512BW-NEXT:    kshiftrq $16, %k0, %k1
+; AVX512BW-NEXT:    shlq $59, %r12
+; AVX512BW-NEXT:    orq %r13, %r12
+; AVX512BW-NEXT:    andl $1, %r9d
+; AVX512BW-NEXT:    movq %r9, %rax
+; AVX512BW-NEXT:    shlq $60, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r9, %r13
+; AVX512BW-NEXT:    shlq $61, %r13
+; AVX512BW-NEXT:    orq %rax, %r13
+; AVX512BW-NEXT:    kmovd %k1, %r12d
+; AVX512BW-NEXT:    kshiftrq $17, %k0, %k0
+; AVX512BW-NEXT:    shlq $62, %r9
+; AVX512BW-NEXT:    orq %r13, %r9
+; AVX512BW-NEXT:    kmovd %k0, %r13d
+; AVX512BW-NEXT:    shlq $63, %r10
+; AVX512BW-NEXT:    orq %r9, %r10
+; AVX512BW-NEXT:    orq %r14, %r10
+; AVX512BW-NEXT:    movq %r10, 24(%rsi)
+; AVX512BW-NEXT:    andl $1, %r8d
+; AVX512BW-NEXT:    leaq (%r8,%r8,2), %rax
+; AVX512BW-NEXT:    leaq (%rax,%r8,4), %rax
+; AVX512BW-NEXT:    leaq (%rax,%r8,8), %rax
+; AVX512BW-NEXT:    movq %r8, %r9
+; AVX512BW-NEXT:    shlq $4, %r9
+; AVX512BW-NEXT:    orq %rax, %r9
+; AVX512BW-NEXT:    shlq $5, %r8
+; AVX512BW-NEXT:    orq %r9, %r8
+; AVX512BW-NEXT:    movzbl %dil, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %rdi
+; AVX512BW-NEXT:    shlq $6, %rdi
+; AVX512BW-NEXT:    movq %rax, %r9
+; AVX512BW-NEXT:    shlq $7, %r9
+; AVX512BW-NEXT:    orq %rdi, %r9
+; AVX512BW-NEXT:    movq %rax, %rdi
+; AVX512BW-NEXT:    shlq $8, %rdi
+; AVX512BW-NEXT:    orq %r9, %rdi
+; AVX512BW-NEXT:    movq %rax, %r9
+; AVX512BW-NEXT:    shlq $9, %r9
+; AVX512BW-NEXT:    orq %rdi, %r9
+; AVX512BW-NEXT:    movq %rax, %rdi
+; AVX512BW-NEXT:    shlq $10, %rdi
+; AVX512BW-NEXT:    orq %r9, %rdi
+; AVX512BW-NEXT:    movq %rax, %r9
+; AVX512BW-NEXT:    shlq $11, %r9
+; AVX512BW-NEXT:    orq %rdi, %r9
+; AVX512BW-NEXT:    shlq $12, %rax
+; AVX512BW-NEXT:    orq %r9, %rax
+; AVX512BW-NEXT:    movzbl %dl, %edx
+; AVX512BW-NEXT:    andl $1, %edx
+; AVX512BW-NEXT:    movq %rdx, %rdi
+; AVX512BW-NEXT:    shlq $13, %rdi
+; AVX512BW-NEXT:    orq %rax, %rdi
+; AVX512BW-NEXT:    movq %rdx, %rax
+; AVX512BW-NEXT:    shlq $14, %rax
+; AVX512BW-NEXT:    orq %rdi, %rax
+; AVX512BW-NEXT:    movq %rdx, %rdi
+; AVX512BW-NEXT:    shlq $15, %rdi
+; AVX512BW-NEXT:    orq %rax, %rdi
+; AVX512BW-NEXT:    movq %rdx, %rax
+; AVX512BW-NEXT:    shlq $16, %rax
+; AVX512BW-NEXT:    orq %rdi, %rax
+; AVX512BW-NEXT:    movq %rdx, %rdi
+; AVX512BW-NEXT:    shlq $17, %rdi
+; AVX512BW-NEXT:    orq %rax, %rdi
+; AVX512BW-NEXT:    movq %rdx, %rax
+; AVX512BW-NEXT:    shlq $18, %rax
+; AVX512BW-NEXT:    orq %rdi, %rax
+; AVX512BW-NEXT:    shlq $19, %rdx
+; AVX512BW-NEXT:    orq %rax, %rdx
+; AVX512BW-NEXT:    movzbl %cl, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    movq %rax, %rcx
+; AVX512BW-NEXT:    shlq $20, %rcx
+; AVX512BW-NEXT:    orq %rdx, %rcx
+; AVX512BW-NEXT:    movq %rax, %rdx
+; AVX512BW-NEXT:    shlq $21, %rdx
+; AVX512BW-NEXT:    orq %rcx, %rdx
+; AVX512BW-NEXT:    movq %rax, %rcx
+; AVX512BW-NEXT:    shlq $22, %rcx
+; AVX512BW-NEXT:    orq %rdx, %rcx
+; AVX512BW-NEXT:    movq %rax, %rdx
+; AVX512BW-NEXT:    shlq $23, %rdx
+; AVX512BW-NEXT:    orq %rcx, %rdx
+; AVX512BW-NEXT:    movq %rax, %rcx
+; AVX512BW-NEXT:    shlq $24, %rcx
+; AVX512BW-NEXT:    orq %rdx, %rcx
+; AVX512BW-NEXT:    movq %rax, %rdx
+; AVX512BW-NEXT:    shlq $25, %rdx
+; AVX512BW-NEXT:    orq %rcx, %rdx
+; AVX512BW-NEXT:    shlq $26, %rax
+; AVX512BW-NEXT:    orq %rdx, %rax
+; AVX512BW-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; AVX512BW-NEXT:    andl $1, %ecx
+; AVX512BW-NEXT:    movq %rcx, %rdx
+; AVX512BW-NEXT:    shlq $27, %rdx
+; AVX512BW-NEXT:    orq %rax, %rdx
+; AVX512BW-NEXT:    movq %rcx, %rax
+; AVX512BW-NEXT:    shlq $28, %rax
+; AVX512BW-NEXT:    orq %rdx, %rax
+; AVX512BW-NEXT:    movq %rcx, %rdx
+; AVX512BW-NEXT:    shlq $29, %rdx
+; AVX512BW-NEXT:    orq %rax, %rdx
+; AVX512BW-NEXT:    movq %rcx, %rax
+; AVX512BW-NEXT:    shlq $30, %rax
+; AVX512BW-NEXT:    orq %rdx, %rax
+; AVX512BW-NEXT:    movq %rcx, %rdx
+; AVX512BW-NEXT:    shlq $31, %rdx
+; AVX512BW-NEXT:    orq %rax, %rdx
+; AVX512BW-NEXT:    movq %rcx, %rax
+; AVX512BW-NEXT:    shlq $32, %rax
+; AVX512BW-NEXT:    orq %rdx, %rax
+; AVX512BW-NEXT:    shlq $33, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    andl $1, %ebp
+; AVX512BW-NEXT:    movq %rbp, %rax
+; AVX512BW-NEXT:    shlq $34, %rax
+; AVX512BW-NEXT:    orq %rcx, %rax
+; AVX512BW-NEXT:    movq %rbp, %rcx
+; AVX512BW-NEXT:    shlq $35, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    movq %rbp, %rax
+; AVX512BW-NEXT:    shlq $36, %rax
+; AVX512BW-NEXT:    orq %rcx, %rax
+; AVX512BW-NEXT:    movq %rbp, %rcx
+; AVX512BW-NEXT:    shlq $37, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    movq %rbp, %rax
+; AVX512BW-NEXT:    shlq $38, %rax
+; AVX512BW-NEXT:    orq %rcx, %rax
+; AVX512BW-NEXT:    movq %rbp, %rcx
+; AVX512BW-NEXT:    shlq $39, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    shlq $40, %rbp
+; AVX512BW-NEXT:    orq %rcx, %rbp
+; AVX512BW-NEXT:    andl $1, %r15d
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $41, %rax
+; AVX512BW-NEXT:    orq %rbp, %rax
+; AVX512BW-NEXT:    movq %r15, %rcx
+; AVX512BW-NEXT:    shlq $42, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $43, %rax
+; AVX512BW-NEXT:    orq %rcx, %rax
+; AVX512BW-NEXT:    movq %r15, %rcx
+; AVX512BW-NEXT:    shlq $44, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    movq %r15, %rax
+; AVX512BW-NEXT:    shlq $45, %rax
+; AVX512BW-NEXT:    orq %rcx, %rax
+; AVX512BW-NEXT:    movq %r15, %rcx
+; AVX512BW-NEXT:    shlq $46, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    shlq $47, %r15
+; AVX512BW-NEXT:    orq %rcx, %r15
+; AVX512BW-NEXT:    andl $1, %r12d
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $48, %rax
+; AVX512BW-NEXT:    orq %r15, %rax
+; AVX512BW-NEXT:    movq %r12, %rcx
+; AVX512BW-NEXT:    shlq $49, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $50, %rax
+; AVX512BW-NEXT:    orq %rcx, %rax
+; AVX512BW-NEXT:    movq %r12, %rcx
+; AVX512BW-NEXT:    shlq $51, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    movq %r12, %rax
+; AVX512BW-NEXT:    shlq $52, %rax
+; AVX512BW-NEXT:    orq %rcx, %rax
+; AVX512BW-NEXT:    movq %r12, %rcx
+; AVX512BW-NEXT:    shlq $53, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    shlq $54, %r12
+; AVX512BW-NEXT:    orq %rcx, %r12
+; AVX512BW-NEXT:    andl $1, %r13d
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $55, %rax
+; AVX512BW-NEXT:    orq %r12, %rax
+; AVX512BW-NEXT:    movq %r13, %rcx
+; AVX512BW-NEXT:    shlq $56, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $57, %rax
+; AVX512BW-NEXT:    orq %rcx, %rax
+; AVX512BW-NEXT:    movq %r13, %rcx
+; AVX512BW-NEXT:    shlq $58, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    shlq $59, %rax
+; AVX512BW-NEXT:    orq %rcx, %rax
+; AVX512BW-NEXT:    movq %r13, %rcx
+; AVX512BW-NEXT:    shlq $60, %rcx
+; AVX512BW-NEXT:    orq %rax, %rcx
+; AVX512BW-NEXT:    shlq $61, %r13
+; AVX512BW-NEXT:    orq %rcx, %r13
+; AVX512BW-NEXT:    andl $1, %ebx
+; AVX512BW-NEXT:    shlq $62, %rbx
+; AVX512BW-NEXT:    orq %r13, %rbx
+; AVX512BW-NEXT:    shlq $63, %r11
+; AVX512BW-NEXT:    orq %rbx, %r11
+; AVX512BW-NEXT:    orq %r8, %r11
+; AVX512BW-NEXT:    movq %r11, 8(%rsi)
+; AVX512BW-NEXT:    popq %rbx
+; AVX512BW-NEXT:    popq %r12
+; AVX512BW-NEXT:    popq %r13
+; AVX512BW-NEXT:    popq %r14
+; AVX512BW-NEXT:    popq %r15
+; AVX512BW-NEXT:    popq %rbp
+; AVX512BW-NEXT:    retq
+  %src.vec = load <64 x i1>, ptr %in.vec, align 64
+  %tgt.mask = shufflevector <64 x i1> %src.vec, <64 x i1> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+  store <448 x i1> %tgt.mask, ptr %out.vec, align 64
+  ret void
+}
+
+define void @mask_replication_factor8_vf2(ptr %in.vec, ptr %out.vec) nounwind { ; factor-8 replication of a 2-element i1 mask; the assertions are autogenerated, regenerate them instead of hand-editing
+; AVX512F-ONLY-LABEL: mask_replication_factor8_vf2:
+; AVX512F-ONLY:       # %bb.0:
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, (%rsi)
+; AVX512F-ONLY-NEXT:    vzeroupper
+; AVX512F-ONLY-NEXT:    retq
+;
+; AVX512DQ-LABEL: mask_replication_factor8_vf2:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    kmovb (%rdi), %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, (%rsi)
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: mask_replication_factor8_vf2:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    kmovw (%rdi), %k1
+; AVX512BW-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
+; AVX512BW-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; AVX512BW-NEXT:    kmovw %k0, (%rsi)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+  %src.vec = load <2 x i1>, ptr %in.vec, align 64 ; load the <2 x i1> source mask
+  %tgt.mask = shufflevector <2 x i1> %src.vec, <2 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> ; result lane i takes source element i/8
+  store <16 x i1> %tgt.mask, ptr %out.vec, align 64 ; store the <16 x i1> replicated mask
+  ret void
+}
+
+define void @mask_replication_factor8_vf4(ptr %in.vec, ptr %out.vec) nounwind { ; factor-8 replication of a 4-element i1 mask; the assertions are autogenerated, regenerate them instead of hand-editing
+; AVX512F-ONLY-LABEL: mask_replication_factor8_vf4:
+; AVX512F-ONLY:       # %bb.0:
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; AVX512F-ONLY-NEXT:    kmovw %k1, 2(%rsi)
+; AVX512F-ONLY-NEXT:    kmovw %k0, (%rsi)
+; AVX512F-ONLY-NEXT:    vzeroupper
+; AVX512F-ONLY-NEXT:    retq
+;
+; AVX512DQ-LABEL: mask_replication_factor8_vf4:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    kmovb (%rdi), %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k0
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k1
+; AVX512DQ-NEXT:    kmovw %k1, 2(%rsi)
+; AVX512DQ-NEXT:    kmovw %k0, (%rsi)
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: mask_replication_factor8_vf4:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    kmovw (%rdi), %k0
+; AVX512BW-NEXT:    vpmovm2w %k0, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
+; AVX512BW-NEXT:    vpermw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vpmovw2m %zmm0, %k0
+; AVX512BW-NEXT:    kmovd %k0, (%rsi)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+  %src.vec = load <4 x i1>, ptr %in.vec, align 64 ; load the <4 x i1> source mask
+  %tgt.mask = shufflevector <4 x i1> %src.vec, <4 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> ; result lane i takes source element i/8
+  store <32 x i1> %tgt.mask, ptr %out.vec, align 64 ; store the <32 x i1> replicated mask
+  ret void
+}
+
+define void @mask_replication_factor8_vf8(ptr %in.vec, ptr %out.vec) nounwind { ; factor-8 replication of an 8-element i1 mask; the assertions are autogenerated, regenerate them instead of hand-editing
+; AVX512F-ONLY-LABEL: mask_replication_factor8_vf8:
+; AVX512F-ONLY:       # %bb.0:
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k2
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k3
+; AVX512F-ONLY-NEXT:    kmovw %k3, 6(%rsi)
+; AVX512F-ONLY-NEXT:    kmovw %k2, 4(%rsi)
+; AVX512F-ONLY-NEXT:    kmovw %k1, 2(%rsi)
+; AVX512F-ONLY-NEXT:    kmovw %k0, (%rsi)
+; AVX512F-ONLY-NEXT:    vzeroupper
+; AVX512F-ONLY-NEXT:    retq
+;
+; AVX512DQ-LABEL: mask_replication_factor8_vf8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    kmovb (%rdi), %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k0
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k1
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k2
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k3
+; AVX512DQ-NEXT:    kmovw %k3, 6(%rsi)
+; AVX512DQ-NEXT:    kmovw %k2, 4(%rsi)
+; AVX512DQ-NEXT:    kmovw %k1, 2(%rsi)
+; AVX512DQ-NEXT:    kmovw %k0, (%rsi)
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-ONLY-LABEL: mask_replication_factor8_vf8:
+; AVX512BW-ONLY:       # %bb.0:
+; AVX512BW-ONLY-NEXT:    kmovw (%rdi), %k0
+; AVX512BW-ONLY-NEXT:    vpmovm2b %k0, %zmm0
+; AVX512BW-ONLY-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
+; AVX512BW-ONLY-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19,36,36,36,36,36,36,36,36,37,37,37,37,37,37,37,37,54,54,54,54,54,54,54,54,55,55,55,55,55,55,55,55]
+; AVX512BW-ONLY-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512BW-ONLY-NEXT:    kmovq %k0, (%rsi)
+; AVX512BW-ONLY-NEXT:    vzeroupper
+; AVX512BW-ONLY-NEXT:    retq
+;
+; AVX512VBMI-ONLY-LABEL: mask_replication_factor8_vf8:
+; AVX512VBMI-ONLY:       # %bb.0:
+; AVX512VBMI-ONLY-NEXT:    kmovw (%rdi), %k0
+; AVX512VBMI-ONLY-NEXT:    vpmovm2b %k0, %zmm0
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
+; AVX512VBMI-ONLY-NEXT:    vpermb %zmm0, %zmm1, %zmm0
+; AVX512VBMI-ONLY-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512VBMI-ONLY-NEXT:    kmovq %k0, (%rsi)
+; AVX512VBMI-ONLY-NEXT:    vzeroupper
+; AVX512VBMI-ONLY-NEXT:    retq
+  %src.vec = load <8 x i1>, ptr %in.vec, align 64 ; load the <8 x i1> source mask
+  %tgt.mask = shufflevector <8 x i1> %src.vec, <8 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> ; result lane i takes source element i/8
+  store <64 x i1> %tgt.mask, ptr %out.vec, align 64 ; store the <64 x i1> replicated mask
+  ret void
+}
+
+define void @mask_replication_factor8_vf16(ptr %in.vec, ptr %out.vec) nounwind { ; factor-8 replication of a 16-element i1 mask; the assertions are autogenerated, regenerate them instead of hand-editing
+; AVX512F-ONLY-LABEL: mask_replication_factor8_vf16:
+; AVX512F-ONLY:       # %bb.0:
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k2
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k3
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k4
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k5
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k6
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k7
+; AVX512F-ONLY-NEXT:    kmovw %k7, 12(%rsi)
+; AVX512F-ONLY-NEXT:    kmovw %k6, 14(%rsi)
+; AVX512F-ONLY-NEXT:    kmovw %k5, 8(%rsi)
+; AVX512F-ONLY-NEXT:    kmovw %k4, 10(%rsi)
+; AVX512F-ONLY-NEXT:    kmovw %k3, 4(%rsi)
+; AVX512F-ONLY-NEXT:    kmovw %k2, 6(%rsi)
+; AVX512F-ONLY-NEXT:    kmovw %k1, 2(%rsi)
+; AVX512F-ONLY-NEXT:    kmovw %k0, (%rsi)
+; AVX512F-ONLY-NEXT:    vzeroupper
+; AVX512F-ONLY-NEXT:    retq
+;
+; AVX512DQ-LABEL: mask_replication_factor8_vf16:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    kmovw (%rdi), %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k0
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k1
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k2
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k3
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k4
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k5
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k6
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k7
+; AVX512DQ-NEXT:    kmovw %k7, 12(%rsi)
+; AVX512DQ-NEXT:    kmovw %k6, 14(%rsi)
+; AVX512DQ-NEXT:    kmovw %k5, 8(%rsi)
+; AVX512DQ-NEXT:    kmovw %k4, 10(%rsi)
+; AVX512DQ-NEXT:    kmovw %k3, 4(%rsi)
+; AVX512DQ-NEXT:    kmovw %k2, 6(%rsi)
+; AVX512DQ-NEXT:    kmovw %k1, 2(%rsi)
+; AVX512DQ-NEXT:    kmovw %k0, (%rsi)
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: mask_replication_factor8_vf16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    kmovw (%rdi), %k0
+; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm1 = zmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19,36,36,36,36,36,36,36,36,37,37,37,37,37,37,37,37,54,54,54,54,54,54,54,54,55,55,55,55,55,55,55,55]
+; AVX512BW-NEXT:    vpmovb2m %zmm1, %k0
+; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,26,26,26,26,26,26,26,26,27,27,27,27,27,27,27,27,44,44,44,44,44,44,44,44,45,45,45,45,45,45,45,45,62,62,62,62,62,62,62,62,63,63,63,63,63,63,63,63]
+; AVX512BW-NEXT:    vpmovb2m %zmm0, %k1
+; AVX512BW-NEXT:    kmovq %k1, 8(%rsi)
+; AVX512BW-NEXT:    kmovq %k0, (%rsi)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+  %src.vec = load <16 x i1>, ptr %in.vec, align 64 ; load the <16 x i1> source mask
+  %tgt.mask = shufflevector <16 x i1> %src.vec, <16 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15> ; result lane i takes source element i/8
+  store <128 x i1> %tgt.mask, ptr %out.vec, align 64 ; store the <128 x i1> replicated mask
+  ret void
+}
+
+define void @mask_replication_factor8_vf32(ptr %in.vec, ptr %out.vec) nounwind { ; factor-8 replication of a 32-element i1 mask; the assertions are autogenerated, regenerate them instead of hand-editing
+; AVX512F-ONLY-LABEL: mask_replication_factor8_vf32:
+; AVX512F-ONLY:       # %bb.0:
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    kmovw 2(%rdi), %k1
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm2
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm3, %zmm4
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm5, %zmm6
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm7, %zmm8
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm9, %zmm10
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm11, %zmm12
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm13 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm13, %zmm14
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm15 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm15, %zmm0
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm16, %zmm16, %zmm16 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vpermd %zmm16, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT:    vpermd %zmm16, %zmm3, %zmm3
+; AVX512F-ONLY-NEXT:    vpermd %zmm16, %zmm5, %zmm5
+; AVX512F-ONLY-NEXT:    vpermd %zmm16, %zmm7, %zmm7
+; AVX512F-ONLY-NEXT:    vpermd %zmm16, %zmm9, %zmm9
+; AVX512F-ONLY-NEXT:    vpermd %zmm16, %zmm11, %zmm11
+; AVX512F-ONLY-NEXT:    vpermd %zmm16, %zmm13, %zmm13
+; AVX512F-ONLY-NEXT:    vpermd %zmm16, %zmm15, %zmm15
+; AVX512F-ONLY-NEXT:    vptestmd %zmm15, %zmm15, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 28(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm13, %zmm13, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 30(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm11, %zmm11, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 24(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm9, %zmm9, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 26(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm7, %zmm7, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 20(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm5, %zmm5, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 22(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm3, %zmm3, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 16(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 18(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 12(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm14, %zmm14, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 14(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm12, %zmm12, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 8(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm10, %zmm10, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 10(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm8, %zmm8, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 4(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm6, %zmm6, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 6(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm4, %zmm4, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, (%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 2(%rsi)
+; AVX512F-ONLY-NEXT:    vzeroupper
+; AVX512F-ONLY-NEXT:    retq
+;
+; AVX512DQ-LABEL: mask_replication_factor8_vf32:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    kmovw (%rdi), %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT:    kmovw 2(%rdi), %k0
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm2
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm3, %zmm4
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm5, %zmm6
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm7, %zmm8
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm9, %zmm10
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm11, %zmm12
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm13 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm13, %zmm14
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm15 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm15, %zmm0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm16
+; AVX512DQ-NEXT:    vpermd %zmm16, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpermd %zmm16, %zmm3, %zmm3
+; AVX512DQ-NEXT:    vpermd %zmm16, %zmm5, %zmm5
+; AVX512DQ-NEXT:    vpermd %zmm16, %zmm7, %zmm7
+; AVX512DQ-NEXT:    vpermd %zmm16, %zmm9, %zmm9
+; AVX512DQ-NEXT:    vpermd %zmm16, %zmm11, %zmm11
+; AVX512DQ-NEXT:    vpermd %zmm16, %zmm13, %zmm13
+; AVX512DQ-NEXT:    vpermd %zmm16, %zmm15, %zmm15
+; AVX512DQ-NEXT:    vpmovd2m %zmm15, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 28(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm13, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 30(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm11, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 24(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm9, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 26(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm7, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 20(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm5, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 22(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm3, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 16(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 18(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 12(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm14, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 14(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm12, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 8(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm10, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 10(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm8, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 4(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm6, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 6(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm4, %k0
+; AVX512DQ-NEXT:    kmovw %k0, (%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 2(%rsi)
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: mask_replication_factor8_vf32:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    kmovd (%rdi), %k0
+; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,0,1,0,1,0,1]
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT:    vpshufb %zmm2, %zmm1, %zmm3
+; AVX512BW-NEXT:    vpmovb2m %zmm3, %k0
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11,12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13,14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpshufb %zmm3, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3,2,3,2,3]
+; AVX512BW-NEXT:    vpshufb %zmm2, %zmm0, %zmm1
+; AVX512BW-NEXT:    vpmovb2m %zmm1, %k2
+; AVX512BW-NEXT:    vpshufb %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovb2m %zmm0, %k3
+; AVX512BW-NEXT:    kmovq %k3, 24(%rsi)
+; AVX512BW-NEXT:    kmovq %k2, 16(%rsi)
+; AVX512BW-NEXT:    kmovq %k1, 8(%rsi)
+; AVX512BW-NEXT:    kmovq %k0, (%rsi)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+  %src.vec = load <32 x i1>, ptr %in.vec, align 64 ; load the <32 x i1> source mask
+  %tgt.mask = shufflevector <32 x i1> %src.vec, <32 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31> ; result lane i takes source element i/8
+  store <256 x i1> %tgt.mask, ptr %out.vec, align 64 ; store the <256 x i1> replicated mask
+  ret void
+}
+
+define void @mask_replication_factor8_vf64(ptr %in.vec, ptr %out.vec) nounwind { ; factor-8 replication of a 64-element i1 mask, every source element appears 8 times in a row in the 512-element result
+; AVX512F-ONLY-LABEL: mask_replication_factor8_vf64:
+; AVX512F-ONLY:       # %bb.0:
+; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    kmovw 2(%rdi), %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; AVX512F-ONLY-NEXT:    kmovw 4(%rdi), %k1
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; AVX512F-ONLY-NEXT:    kmovw 6(%rdi), %k1
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm3, %zmm4
+; AVX512F-ONLY-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm5, %zmm6
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm7, %zmm8
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm9, %zmm10
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm11, %zmm12
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm13 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm13, %zmm14
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm15 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm15, %zmm16
+; AVX512F-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm17 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13]
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm17, %zmm4
+; AVX512F-ONLY-NEXT:    vpermd %zmm1, %zmm3, %zmm18
+; AVX512F-ONLY-NEXT:    vpermd %zmm1, %zmm5, %zmm19
+; AVX512F-ONLY-NEXT:    vpermd %zmm1, %zmm7, %zmm20
+; AVX512F-ONLY-NEXT:    vpermd %zmm1, %zmm9, %zmm21
+; AVX512F-ONLY-NEXT:    vpermd %zmm1, %zmm11, %zmm22
+; AVX512F-ONLY-NEXT:    vpermd %zmm1, %zmm13, %zmm23
+; AVX512F-ONLY-NEXT:    vpermd %zmm1, %zmm15, %zmm24
+; AVX512F-ONLY-NEXT:    vpermd %zmm1, %zmm17, %zmm1
+; AVX512F-ONLY-NEXT:    vpermd %zmm2, %zmm3, %zmm25
+; AVX512F-ONLY-NEXT:    vpermd %zmm2, %zmm5, %zmm26
+; AVX512F-ONLY-NEXT:    vpermd %zmm2, %zmm7, %zmm27
+; AVX512F-ONLY-NEXT:    vpermd %zmm2, %zmm9, %zmm28
+; AVX512F-ONLY-NEXT:    vpermd %zmm2, %zmm11, %zmm29
+; AVX512F-ONLY-NEXT:    vpermd %zmm2, %zmm13, %zmm30
+; AVX512F-ONLY-NEXT:    vpermd %zmm2, %zmm15, %zmm31
+; AVX512F-ONLY-NEXT:    vpermd %zmm2, %zmm17, %zmm2
+; AVX512F-ONLY-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm3, %zmm3
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm5, %zmm5
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm7, %zmm7
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm9, %zmm9
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm11, %zmm11
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm13, %zmm13
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm15, %zmm15
+; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm17, %zmm0
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 60(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm15, %zmm15, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 62(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm13, %zmm13, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 56(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm11, %zmm11, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 58(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm9, %zmm9, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 52(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm7, %zmm7, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 54(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm5, %zmm5, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 48(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm3, %zmm3, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 50(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm2, %zmm2, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 44(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm31, %zmm31, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 46(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm30, %zmm30, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 40(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm29, %zmm29, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 42(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm28, %zmm28, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 36(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm27, %zmm27, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 38(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm26, %zmm26, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 32(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm25, %zmm25, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 34(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 28(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm24, %zmm24, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 30(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm23, %zmm23, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 24(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm22, %zmm22, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 26(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm21, %zmm21, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 20(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm20, %zmm20, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 22(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm19, %zmm19, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 16(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm18, %zmm18, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 18(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm4, %zmm4, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 12(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm16, %zmm16, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 14(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm14, %zmm14, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 8(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm12, %zmm12, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 10(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm10, %zmm10, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 4(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm8, %zmm8, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 6(%rsi)
+; AVX512F-ONLY-NEXT:    vptestmd %zmm6, %zmm6, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, (%rsi)
+; AVX512F-ONLY-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; AVX512F-ONLY-NEXT:    kmovw %k0, 2(%rsi)
+; AVX512F-ONLY-NEXT:    vzeroupper
+; AVX512F-ONLY-NEXT:    retq
+;
+; AVX512DQ-LABEL: mask_replication_factor8_vf64:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    kmovw (%rdi), %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT:    kmovw 2(%rdi), %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm1
+; AVX512DQ-NEXT:    kmovw 4(%rdi), %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm2
+; AVX512DQ-NEXT:    kmovw 6(%rdi), %k0
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm3, %zmm4
+; AVX512DQ-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm5, %zmm6
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm7, %zmm8
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm9, %zmm10
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm11, %zmm12
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm13 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm13, %zmm14
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm15 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm15, %zmm16
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm17 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13]
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm17, %zmm4
+; AVX512DQ-NEXT:    vpermd %zmm1, %zmm3, %zmm18
+; AVX512DQ-NEXT:    vpermd %zmm1, %zmm5, %zmm19
+; AVX512DQ-NEXT:    vpermd %zmm1, %zmm7, %zmm20
+; AVX512DQ-NEXT:    vpermd %zmm1, %zmm9, %zmm21
+; AVX512DQ-NEXT:    vpermd %zmm1, %zmm11, %zmm22
+; AVX512DQ-NEXT:    vpermd %zmm1, %zmm13, %zmm23
+; AVX512DQ-NEXT:    vpermd %zmm1, %zmm15, %zmm24
+; AVX512DQ-NEXT:    vpermd %zmm1, %zmm17, %zmm1
+; AVX512DQ-NEXT:    vpermd %zmm2, %zmm3, %zmm25
+; AVX512DQ-NEXT:    vpermd %zmm2, %zmm5, %zmm26
+; AVX512DQ-NEXT:    vpermd %zmm2, %zmm7, %zmm27
+; AVX512DQ-NEXT:    vpermd %zmm2, %zmm9, %zmm28
+; AVX512DQ-NEXT:    vpermd %zmm2, %zmm11, %zmm29
+; AVX512DQ-NEXT:    vpermd %zmm2, %zmm13, %zmm30
+; AVX512DQ-NEXT:    vpermd %zmm2, %zmm15, %zmm31
+; AVX512DQ-NEXT:    vpermd %zmm2, %zmm17, %zmm2
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm3, %zmm3
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm5, %zmm5
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm7, %zmm7
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm9, %zmm9
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm11, %zmm11
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm13, %zmm13
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm15, %zmm15
+; AVX512DQ-NEXT:    vpermd %zmm0, %zmm17, %zmm0
+; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 60(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm15, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 62(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm13, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 56(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm11, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 58(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm9, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 52(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm7, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 54(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm5, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 48(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm3, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 50(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 44(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm31, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 46(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm30, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 40(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm29, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 42(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm28, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 36(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm27, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 38(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm26, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 32(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm25, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 34(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm1, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 28(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm24, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 30(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm23, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 24(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm22, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 26(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm21, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 20(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm20, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 22(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm19, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 16(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm18, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 18(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm4, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 12(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm16, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 14(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm14, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 8(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm12, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 10(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm10, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 4(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm8, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 6(%rsi)
+; AVX512DQ-NEXT:    vpmovd2m %zmm6, %k0
+; AVX512DQ-NEXT:    kmovw %k0, (%rsi)
+; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 2(%rsi)
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: mask_replication_factor8_vf64:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    kmovq (%rdi), %k0
+; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,0,1,0,1,0,1]
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT:    vpshufb %zmm2, %zmm1, %zmm3
+; AVX512BW-NEXT:    vpmovb2m %zmm3, %k0
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11,12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13,14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpshufb %zmm3, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,2,3,2,3]
+; AVX512BW-NEXT:    vpshufb %zmm3, %zmm1, %zmm4
+; AVX512BW-NEXT:    vpmovb2m %zmm4, %k2
+; AVX512BW-NEXT:    vpshufb %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpmovb2m %zmm1, %k3
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,4,5,4,5,4,5]
+; AVX512BW-NEXT:    vpshufb %zmm3, %zmm1, %zmm4
+; AVX512BW-NEXT:    vpmovb2m %zmm4, %k4
+; AVX512BW-NEXT:    vpshufb %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpmovb2m %zmm1, %k5
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[6,7,6,7,6,7,6,7]
+; AVX512BW-NEXT:    vpshufb %zmm3, %zmm0, %zmm1
+; AVX512BW-NEXT:    vpmovb2m %zmm1, %k6
+; AVX512BW-NEXT:    vpshufb %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovb2m %zmm0, %k7
+; AVX512BW-NEXT:    kmovq %k7, 48(%rsi)
+; AVX512BW-NEXT:    kmovq %k6, 56(%rsi)
+; AVX512BW-NEXT:    kmovq %k5, 32(%rsi)
+; AVX512BW-NEXT:    kmovq %k4, 40(%rsi)
+; AVX512BW-NEXT:    kmovq %k3, 16(%rsi)
+; AVX512BW-NEXT:    kmovq %k2, 24(%rsi)
+; AVX512BW-NEXT:    kmovq %k1, 8(%rsi)
+; AVX512BW-NEXT:    kmovq %k0, (%rsi)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+  %src.vec = load <64 x i1>, ptr %in.vec, align 64 ; source mask read from %in.vec; the shuffle on the next line lists each index 0..63 eight consecutive times
+  %tgt.mask = shufflevector <64 x i1> %src.vec, <64 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
+  store <512 x i1> %tgt.mask, ptr %out.vec, align 64 ; replicated 512 x i1 mask written to %out.vec
+  ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; AVX512: {{.*}}
+; FALLBACK0: {{.*}}
+; FALLBACK1: {{.*}}
+; FALLBACK2: {{.*}}
+; FALLBACK3: {{.*}}
+; FALLBACK4: {{.*}}
+; FALLBACK5: {{.*}}
+; FALLBACK6: {{.*}}
+; FALLBACK7: {{.*}}


        


More information about the llvm-commits mailing list