[llvm] 8ac00ca - [X86] lowerShuffleWithUndefHalf - don't split vXi8 unary shuffles if the 128-bit source lanes are already in place (#122919)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Jan 15 00:20:01 PST 2025
Author: Simon Pilgrim
Date: 2025-01-15T08:19:54Z
New Revision: 8ac00ca4867835cacaf013f5c442658b9b1bce38
URL: https://github.com/llvm/llvm-project/commit/8ac00ca4867835cacaf013f5c442658b9b1bce38
DIFF: https://github.com/llvm/llvm-project/commit/8ac00ca4867835cacaf013f5c442658b9b1bce38.diff
LOG: [X86] lowerShuffleWithUndefHalf - don't split vXi8 unary shuffles if the 128-bit source lanes are already in place (#122919)
Allows us to use PSHUFB to shuffle the bytes within each 128-bit lane, and then perform a sub-lane permutation down to the lower half.
Fixes #116815
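As a minimal sketch of the kind of shuffle this targets (a hypothetical function, not one of the commit's test cases), consider a unary <32 x i8> shuffle whose upper result half is undef and whose lower half only reads bytes from the matching 128-bit source lanes:

define <16 x i8> @even_bytes(<32 x i8> %x) {
  ; Result bytes 0..7 come from the low 128-bit lane of %x and
  ; bytes 8..15 from the high lane, so both source lanes are
  ; already "in place" for a full-width byte shuffle.
  %s = shufflevector <32 x i8> %x, <32 x i8> poison,
       <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14,
                   i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  ret <16 x i8> %s
}

Per the description above, shuffles of this shape are no longer split into two 128-bit halves; the whole vector is shuffled with a single full-width PSHUFB and the wanted elements are then permuted down into the lower 128 bits, which is the vpshufb + vpermd pattern visible in the updated CHECK lines below.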
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll
llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
llvm/test/CodeGen/X86/trunc-vector-width.ll
llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll
llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll
llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll
llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll
llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
llvm/test/CodeGen/X86/x86-interleaved-access.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index d4152ff4a816c4..90e3e15b1fb46c 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -15672,12 +15672,16 @@ static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
(!isSingleSHUFPSMask(HalfMask) ||
Subtarget.hasFastVariableCrossLaneShuffle()))
return SDValue();
- // If this is a unary shuffle (assume that the 2nd operand is
+ // If this is an unary shuffle (assume that the 2nd operand is
// canonicalized to undef), then we can use vpermpd. Otherwise, we
// are better off extracting the upper half of 1 operand and using a
// narrow shuffle.
if (EltWidth == 64 && V2.isUndef())
return SDValue();
+ // If this is an unary vXi8 shuffle with inplace halves, then perform as
+ // full width pshufb, and then merge.
+ if (EltWidth == 8 && HalfIdx1 == 0 && HalfIdx2 == 1)
+ return SDValue();
}
// AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
if (Subtarget.hasAVX512() && VT.is512BitVector())
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll
index 9642e5e4c9f868..26af46263c0e2c 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll
@@ -699,16 +699,13 @@ define <16 x i8> @evenelts_v32i16_shuffle_v16i16_to_v16i8(<32 x i16> %n2) nounwi
;
; AVX2-LABEL: evenelts_v32i16_shuffle_v16i16_to_v16i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT: vmovd {{.*#+}} xmm3 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,0,0,4]
+; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -783,16 +780,13 @@ define <16 x i8> @oddelts_v32i16_shuffle_v16i16_to_v16i8(<32 x i16> %n2) nounwin
;
; AVX2-LABEL: oddelts_v32i16_shuffle_v16i16_to_v16i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT: vmovd {{.*#+}} xmm3 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,0,0,4]
+; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
index f0f02f1ed890ae..ec442c185706cf 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
@@ -275,53 +275,45 @@ define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_
define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62(<64 x i8> %x) {
; AVX512F-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
-; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512F-NEXT: vpsrld $8, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,4,16,21]
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,30,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vpermt2d %zmm2, %zmm1, %zmm0
+; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
-; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512VL-NEXT: vpsrld $8, %ymm0, %ymm0
-; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,4,8,13]
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,30,u,u,u,u,u,u,u,u]
+; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512VL-NEXT: vpermt2d %ymm2, %ymm1, %ymm0
+; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512BW-NEXT: vpsrld $8, %ymm0, %ymm0
-; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,4,16,21]
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,30,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
-; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
-; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512BWVL-NEXT: vpsrld $8, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0
-; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512BWVL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,4,8,13]
+; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,30,u,u,u,u,u,u,u,u]
+; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BWVL-NEXT: vpermt2d %ymm2, %ymm1, %ymm0
+; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/trunc-vector-width.ll b/llvm/test/CodeGen/X86/trunc-vector-width.ll
index bc6969c5cd37a6..42cc624b5a5359 100644
--- a/llvm/test/CodeGen/X86/trunc-vector-width.ll
+++ b/llvm/test/CodeGen/X86/trunc-vector-width.ll
@@ -4,14 +4,16 @@
define void @test(ptr %a0) #0 {
; CHECK-LABEL: test:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqu (%rdi), %xmm0
-; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
-; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,4,5,5,0,0,1,1,u,u,u,u,u,u,u,u]
-; CHECK-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vmovdqu (%rdi), %ymm0
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,4,0,0]
+; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = mem[0],ymm0[1,2,3,4,5,6,7]
+; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0
; CHECK-NEXT: vpextrb $1, %xmm0, (%rax)
; CHECK-NEXT: vpextrb $4, %xmm0, (%rax)
; CHECK-NEXT: vpextrb $8, %xmm0, (%rax)
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%load = load <64 x i8>, ptr %a0, align 1
%shuf = shufflevector <64 x i8> %load, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll
index 01181d4b21d9d7..abef980277ecea 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll
@@ -849,146 +849,122 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX2-LABEL: load_i8_stride4_vf16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX2-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX2-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm5
-; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm4
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX2-NEXT: vmovd {{.*#+}} xmm5 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm6
-; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm5
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm6
-; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm5
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX2-NEXT: vmovdqa (%rdi), %ymm2
+; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0
+; AVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm3
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm1
+; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,0,0,4]
+; AVX2-NEXT: vpermd %ymm1, %ymm4, %ymm1
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm5
+; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; AVX2-NEXT: vmovd {{.*#+}} xmm6 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm7
-; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm6
+; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm3
+; AVX2-NEXT: vpermd %ymm3, %ymm4, %ymm3
+; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX2-NEXT: vpshufb %ymm5, %ymm2, %ymm6
+; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
+; AVX2-NEXT: vpshufb %ymm5, %ymm0, %ymm5
+; AVX2-NEXT: vpermd %ymm5, %ymm4, %ymm5
; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX2-NEXT: vpshufb %xmm6, %xmm3, %xmm7
-; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm6
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
-; AVX2-NEXT: vmovd {{.*#+}} xmm7 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-NEXT: vpshufb %xmm7, %xmm1, %xmm8
-; AVX2-NEXT: vpshufb %xmm7, %xmm0, %xmm7
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX2-NEXT: vpshufb %xmm7, %xmm3, %xmm3
-; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm2
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX2-NEXT: vmovd {{.*#+}} xmm3 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
-; AVX2-NEXT: vmovdqa %xmm4, (%rsi)
-; AVX2-NEXT: vmovdqa %xmm5, (%rdx)
-; AVX2-NEXT: vmovdqa %xmm6, (%rcx)
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX2-NEXT: vpshufb %ymm6, %ymm2, %ymm2
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm7
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1]
+; AVX2-NEXT: vpshufb %ymm6, %ymm0, %ymm0
+; AVX2-NEXT: vpermd %ymm0, %ymm4, %ymm0
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
+; AVX2-NEXT: vmovdqa %xmm1, (%rsi)
+; AVX2-NEXT: vmovdqa %xmm3, (%rdx)
+; AVX2-NEXT: vmovdqa %xmm5, (%rcx)
; AVX2-NEXT: vmovdqa %xmm0, (%r8)
+; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX2-FP-LABEL: load_i8_stride4_vf16:
; AVX2-FP: # %bb.0:
-; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX2-FP-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm5
-; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm4
-; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX2-FP-NEXT: vmovd {{.*#+}} xmm5 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm6
-; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm5
-; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
-; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX2-FP-NEXT: vpshufb %xmm5, %xmm3, %xmm6
-; AVX2-FP-NEXT: vpshufb %xmm5, %xmm2, %xmm5
+; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm2
+; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm0
+; AVX2-FP-NEXT: vpshufb %ymm1, %ymm2, %ymm3
+; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX2-FP-NEXT: vpshufb %ymm1, %ymm0, %ymm1
+; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,0,0,4]
+; AVX2-FP-NEXT: vpermd %ymm1, %ymm4, %ymm1
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
+; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX2-FP-NEXT: vpshufb %ymm3, %ymm2, %ymm5
+; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm6
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; AVX2-FP-NEXT: vmovd {{.*#+}} xmm6 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm7
-; AVX2-FP-NEXT: vpshufb %xmm6, %xmm0, %xmm6
+; AVX2-FP-NEXT: vpshufb %ymm3, %ymm0, %ymm3
+; AVX2-FP-NEXT: vpermd %ymm3, %ymm4, %ymm3
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
+; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX2-FP-NEXT: vpshufb %ymm5, %ymm2, %ymm6
+; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm7
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
+; AVX2-FP-NEXT: vpshufb %ymm5, %ymm0, %ymm5
+; AVX2-FP-NEXT: vpermd %ymm5, %ymm4, %ymm5
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
-; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX2-FP-NEXT: vpshufb %xmm6, %xmm3, %xmm7
-; AVX2-FP-NEXT: vpshufb %xmm6, %xmm2, %xmm6
-; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
-; AVX2-FP-NEXT: vmovd {{.*#+}} xmm7 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-FP-NEXT: vpshufb %xmm7, %xmm1, %xmm8
-; AVX2-FP-NEXT: vpshufb %xmm7, %xmm0, %xmm7
-; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
-; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX2-FP-NEXT: vpshufb %xmm7, %xmm3, %xmm3
-; AVX2-FP-NEXT: vpshufb %xmm7, %xmm2, %xmm2
-; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX2-FP-NEXT: vmovd {{.*#+}} xmm3 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX2-FP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
-; AVX2-FP-NEXT: vmovdqa %xmm4, (%rsi)
-; AVX2-FP-NEXT: vmovdqa %xmm5, (%rdx)
-; AVX2-FP-NEXT: vmovdqa %xmm6, (%rcx)
+; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX2-FP-NEXT: vpshufb %ymm6, %ymm2, %ymm2
+; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm7
+; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1]
+; AVX2-FP-NEXT: vpshufb %ymm6, %ymm0, %ymm0
+; AVX2-FP-NEXT: vpermd %ymm0, %ymm4, %ymm0
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
+; AVX2-FP-NEXT: vmovdqa %xmm1, (%rsi)
+; AVX2-FP-NEXT: vmovdqa %xmm3, (%rdx)
+; AVX2-FP-NEXT: vmovdqa %xmm5, (%rcx)
; AVX2-FP-NEXT: vmovdqa %xmm0, (%r8)
+; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
;
; AVX2-FCP-LABEL: load_i8_stride4_vf16:
; AVX2-FCP: # %bb.0:
-; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm5
-; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm4
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm5 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm6
-; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm5
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
-; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm6
-; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm5
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm2
+; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm0
+; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm3
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm1
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,0,0,4]
+; AVX2-FCP-NEXT: vpermd %ymm1, %ymm4, %ymm1
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm5
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm6 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm7
-; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm6
+; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm3
+; AVX2-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm6
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
+; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm5
+; AVX2-FCP-NEXT: vpermd %ymm5, %ymm4, %ymm5
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
-; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm7
-; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm6
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
-; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm7 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm8
-; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm7
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
-; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm3
-; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm2
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm3 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
-; AVX2-FCP-NEXT: vmovdqa %xmm4, (%rsi)
-; AVX2-FCP-NEXT: vmovdqa %xmm5, (%rdx)
-; AVX2-FCP-NEXT: vmovdqa %xmm6, (%rcx)
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm2
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm7
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1]
+; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0
+; AVX2-FCP-NEXT: vpermd %ymm0, %ymm4, %ymm0
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
+; AVX2-FCP-NEXT: vmovdqa %xmm1, (%rsi)
+; AVX2-FCP-NEXT: vmovdqa %xmm3, (%rdx)
+; AVX2-FCP-NEXT: vmovdqa %xmm5, (%rcx)
; AVX2-FCP-NEXT: vmovdqa %xmm0, (%r8)
+; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
;
; AVX512-LABEL: load_i8_stride4_vf16:
@@ -1446,228 +1422,198 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX2-LABEL: load_i8_stride4_vf32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa 64(%rdi), %ymm0
-; AVX2-NEXT: vmovdqa 96(%rdi), %ymm1
-; AVX2-NEXT: vmovdqa (%rdi), %xmm2
-; AVX2-NEXT: vmovdqa 16(%rdi), %xmm3
-; AVX2-NEXT: vmovdqa 32(%rdi), %xmm4
-; AVX2-NEXT: vmovdqa 48(%rdi), %xmm5
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX2-NEXT: vpshufb %xmm6, %xmm5, %xmm7
-; AVX2-NEXT: vpshufb %xmm6, %xmm4, %xmm6
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX2-NEXT: vpshufb %xmm7, %xmm3, %xmm8
-; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm9
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm6[2,3]
-; AVX2-NEXT: vpshufb %ymm7, %ymm1, %ymm9
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,4,0,4,0,4,0,4]
-; AVX2-NEXT: vpermd %ymm9, %ymm6, %ymm9
-; AVX2-NEXT: vpshufb %ymm7, %ymm0, %ymm7
-; AVX2-NEXT: vpermd %ymm7, %ymm6, %ymm7
-; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX2-NEXT: vpshufb %xmm8, %xmm5, %xmm9
-; AVX2-NEXT: vpshufb %xmm8, %xmm4, %xmm8
+; AVX2-NEXT: vmovdqa (%rdi), %ymm1
+; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0
+; AVX2-NEXT: vmovdqa 64(%rdi), %ymm3
+; AVX2-NEXT: vmovdqa 96(%rdi), %ymm4
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX2-NEXT: vpshufb %ymm5, %ymm4, %ymm6
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,4,0,4,0,4,0,4]
+; AVX2-NEXT: vpermd %ymm6, %ymm2, %ymm6
+; AVX2-NEXT: vpshufb %ymm5, %ymm3, %ymm7
+; AVX2-NEXT: vpermd %ymm7, %ymm2, %ymm7
+; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
+; AVX2-NEXT: vpshufb %ymm5, %ymm1, %ymm7
+; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm8
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
+; AVX2-NEXT: vpshufb %ymm5, %ymm0, %ymm5
+; AVX2-NEXT: vpermd %ymm5, %ymm2, %ymm5
+; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX2-NEXT: vpshufb %ymm6, %ymm4, %ymm7
+; AVX2-NEXT: vpermd %ymm7, %ymm2, %ymm7
+; AVX2-NEXT: vpshufb %ymm6, %ymm3, %ymm8
+; AVX2-NEXT: vpermd %ymm8, %ymm2, %ymm8
+; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
+; AVX2-NEXT: vpshufb %ymm6, %ymm1, %ymm8
+; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm9
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1]
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX2-NEXT: vpshufb %xmm9, %xmm3, %xmm10
-; AVX2-NEXT: vpshufb %xmm9, %xmm2, %xmm11
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3]
-; AVX2-NEXT: vpshufb %ymm9, %ymm1, %ymm10
-; AVX2-NEXT: vpermd %ymm10, %ymm6, %ymm10
-; AVX2-NEXT: vpshufb %ymm9, %ymm0, %ymm9
-; AVX2-NEXT: vpermd %ymm9, %ymm6, %ymm9
-; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX2-NEXT: vpshufb %xmm9, %xmm5, %xmm10
-; AVX2-NEXT: vpshufb %xmm9, %xmm4, %xmm9
+; AVX2-NEXT: vpshufb %ymm6, %ymm0, %ymm6
+; AVX2-NEXT: vpermd %ymm6, %ymm2, %ymm6
+; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX2-NEXT: vpshufb %ymm7, %ymm4, %ymm8
+; AVX2-NEXT: vpermd %ymm8, %ymm2, %ymm8
+; AVX2-NEXT: vpshufb %ymm7, %ymm3, %ymm9
+; AVX2-NEXT: vpermd %ymm9, %ymm2, %ymm9
+; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7]
+; AVX2-NEXT: vpshufb %ymm7, %ymm1, %ymm9
+; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm10
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1]
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX2-NEXT: vpshufb %xmm10, %xmm3, %xmm11
-; AVX2-NEXT: vpshufb %xmm10, %xmm2, %xmm12
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3]
-; AVX2-NEXT: vpshufb %ymm10, %ymm1, %ymm11
-; AVX2-NEXT: vpermd %ymm11, %ymm6, %ymm11
-; AVX2-NEXT: vpshufb %ymm10, %ymm0, %ymm10
-; AVX2-NEXT: vpermd %ymm10, %ymm6, %ymm10
-; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX2-NEXT: vpshufb %xmm10, %xmm5, %xmm5
-; AVX2-NEXT: vpshufb %xmm10, %xmm4, %xmm4
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3
-; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
-; AVX2-NEXT: vpshufb %ymm5, %ymm1, %ymm1
-; AVX2-NEXT: vpermd %ymm1, %ymm6, %ymm1
-; AVX2-NEXT: vpshufb %ymm5, %ymm0, %ymm0
-; AVX2-NEXT: vpermd %ymm0, %ymm6, %ymm0
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT: vmovdqa %ymm7, (%rsi)
-; AVX2-NEXT: vmovdqa %ymm8, (%rdx)
-; AVX2-NEXT: vmovdqa %ymm9, (%rcx)
+; AVX2-NEXT: vpshufb %ymm7, %ymm0, %ymm7
+; AVX2-NEXT: vpermd %ymm7, %ymm2, %ymm7
+; AVX2-NEXT: vpblendd {{.*#+}} xmm7 = xmm9[0,1],xmm7[2,3]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm8 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX2-NEXT: vpshufb %ymm8, %ymm4, %ymm4
+; AVX2-NEXT: vpermd %ymm4, %ymm2, %ymm4
+; AVX2-NEXT: vpshufb %ymm8, %ymm3, %ymm3
+; AVX2-NEXT: vpermd %ymm3, %ymm2, %ymm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
+; AVX2-NEXT: vpshufb %ymm8, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; AVX2-NEXT: vpshufb %ymm8, %ymm0, %ymm0
+; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: vmovdqa %ymm5, (%rsi)
+; AVX2-NEXT: vmovdqa %ymm6, (%rdx)
+; AVX2-NEXT: vmovdqa %ymm7, (%rcx)
; AVX2-NEXT: vmovdqa %ymm0, (%r8)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX2-FP-LABEL: load_i8_stride4_vf32:
; AVX2-FP: # %bb.0:
-; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm0
-; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm1
-; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm2
-; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm3
-; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm4
-; AVX2-FP-NEXT: vmovdqa 48(%rdi), %xmm5
-; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX2-FP-NEXT: vpshufb %xmm6, %xmm5, %xmm7
-; AVX2-FP-NEXT: vpshufb %xmm6, %xmm4, %xmm6
-; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
-; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX2-FP-NEXT: vpshufb %xmm7, %xmm3, %xmm8
-; AVX2-FP-NEXT: vpshufb %xmm7, %xmm2, %xmm9
-; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm6[2,3]
-; AVX2-FP-NEXT: vpshufb %ymm7, %ymm1, %ymm9
-; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,4,0,4,0,4,0,4]
-; AVX2-FP-NEXT: vpermd %ymm9, %ymm6, %ymm9
-; AVX2-FP-NEXT: vpshufb %ymm7, %ymm0, %ymm7
-; AVX2-FP-NEXT: vpermd %ymm7, %ymm6, %ymm7
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
-; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX2-FP-NEXT: vpshufb %xmm8, %xmm5, %xmm9
-; AVX2-FP-NEXT: vpshufb %xmm8, %xmm4, %xmm8
+; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm1
+; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm0
+; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm3
+; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm4
+; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX2-FP-NEXT: vpshufb %ymm5, %ymm4, %ymm6
+; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,4,0,4,0,4,0,4]
+; AVX2-FP-NEXT: vpermd %ymm6, %ymm2, %ymm6
+; AVX2-FP-NEXT: vpshufb %ymm5, %ymm3, %ymm7
+; AVX2-FP-NEXT: vpermd %ymm7, %ymm2, %ymm7
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
+; AVX2-FP-NEXT: vpshufb %ymm5, %ymm1, %ymm7
+; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm8
+; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
+; AVX2-FP-NEXT: vpshufb %ymm5, %ymm0, %ymm5
+; AVX2-FP-NEXT: vpermd %ymm5, %ymm2, %ymm5
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX2-FP-NEXT: vpshufb %ymm6, %ymm4, %ymm7
+; AVX2-FP-NEXT: vpermd %ymm7, %ymm2, %ymm7
+; AVX2-FP-NEXT: vpshufb %ymm6, %ymm3, %ymm8
+; AVX2-FP-NEXT: vpermd %ymm8, %ymm2, %ymm8
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
+; AVX2-FP-NEXT: vpshufb %ymm6, %ymm1, %ymm8
+; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm9
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1]
-; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX2-FP-NEXT: vpshufb %xmm9, %xmm3, %xmm10
-; AVX2-FP-NEXT: vpshufb %xmm9, %xmm2, %xmm11
-; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3]
-; AVX2-FP-NEXT: vpshufb %ymm9, %ymm1, %ymm10
-; AVX2-FP-NEXT: vpermd %ymm10, %ymm6, %ymm10
-; AVX2-FP-NEXT: vpshufb %ymm9, %ymm0, %ymm9
-; AVX2-FP-NEXT: vpermd %ymm9, %ymm6, %ymm9
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX2-FP-NEXT: vpshufb %xmm9, %xmm5, %xmm10
-; AVX2-FP-NEXT: vpshufb %xmm9, %xmm4, %xmm9
+; AVX2-FP-NEXT: vpshufb %ymm6, %ymm0, %ymm6
+; AVX2-FP-NEXT: vpermd %ymm6, %ymm2, %ymm6
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX2-FP-NEXT: vpshufb %ymm7, %ymm4, %ymm8
+; AVX2-FP-NEXT: vpermd %ymm8, %ymm2, %ymm8
+; AVX2-FP-NEXT: vpshufb %ymm7, %ymm3, %ymm9
+; AVX2-FP-NEXT: vpermd %ymm9, %ymm2, %ymm9
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7]
+; AVX2-FP-NEXT: vpshufb %ymm7, %ymm1, %ymm9
+; AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm10
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1]
-; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX2-FP-NEXT: vpshufb %xmm10, %xmm3, %xmm11
-; AVX2-FP-NEXT: vpshufb %xmm10, %xmm2, %xmm12
-; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3]
-; AVX2-FP-NEXT: vpshufb %ymm10, %ymm1, %ymm11
-; AVX2-FP-NEXT: vpermd %ymm11, %ymm6, %ymm11
-; AVX2-FP-NEXT: vpshufb %ymm10, %ymm0, %ymm10
-; AVX2-FP-NEXT: vpermd %ymm10, %ymm6, %ymm10
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
-; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX2-FP-NEXT: vpshufb %xmm10, %xmm5, %xmm5
-; AVX2-FP-NEXT: vpshufb %xmm10, %xmm4, %xmm4
-; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX2-FP-NEXT: vpshufb %xmm5, %xmm3, %xmm3
-; AVX2-FP-NEXT: vpshufb %xmm5, %xmm2, %xmm2
-; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
-; AVX2-FP-NEXT: vpshufb %ymm5, %ymm1, %ymm1
-; AVX2-FP-NEXT: vpermd %ymm1, %ymm6, %ymm1
-; AVX2-FP-NEXT: vpshufb %ymm5, %ymm0, %ymm0
-; AVX2-FP-NEXT: vpermd %ymm0, %ymm6, %ymm0
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FP-NEXT: vmovdqa %ymm7, (%rsi)
-; AVX2-FP-NEXT: vmovdqa %ymm8, (%rdx)
-; AVX2-FP-NEXT: vmovdqa %ymm9, (%rcx)
+; AVX2-FP-NEXT: vpshufb %ymm7, %ymm0, %ymm7
+; AVX2-FP-NEXT: vpermd %ymm7, %ymm2, %ymm7
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm9[0,1],xmm7[2,3]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX2-FP-NEXT: vpshufb %ymm8, %ymm4, %ymm4
+; AVX2-FP-NEXT: vpermd %ymm4, %ymm2, %ymm4
+; AVX2-FP-NEXT: vpshufb %ymm8, %ymm3, %ymm3
+; AVX2-FP-NEXT: vpermd %ymm3, %ymm2, %ymm3
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
+; AVX2-FP-NEXT: vpshufb %ymm8, %ymm1, %ymm1
+; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm4
+; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; AVX2-FP-NEXT: vpshufb %ymm8, %ymm0, %ymm0
+; AVX2-FP-NEXT: vpermd %ymm0, %ymm2, %ymm0
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-FP-NEXT: vmovdqa %ymm5, (%rsi)
+; AVX2-FP-NEXT: vmovdqa %ymm6, (%rdx)
+; AVX2-FP-NEXT: vmovdqa %ymm7, (%rcx)
; AVX2-FP-NEXT: vmovdqa %ymm0, (%r8)
; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
;
; AVX2-FCP-LABEL: load_i8_stride4_vf32:
; AVX2-FCP: # %bb.0:
-; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm0
-; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm1
-; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm2
-; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm3
-; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm4
-; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm5
-; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm7
-; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm6
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
-; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm8
-; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm9
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm6[2,3]
-; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm9
-; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,4,0,4,0,4,0,4]
-; AVX2-FCP-NEXT: vpermd %ymm9, %ymm6, %ymm9
-; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm7
-; AVX2-FCP-NEXT: vpermd %ymm7, %ymm6, %ymm7
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
-; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm9
-; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm4, %xmm8
+; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm1
+; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm0
+; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm3
+; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm4
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm6
+; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,4,0,4,0,4,0,4]
+; AVX2-FCP-NEXT: vpermd %ymm6, %ymm2, %ymm6
+; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm7
+; AVX2-FCP-NEXT: vpermd %ymm7, %ymm2, %ymm7
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
+; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm7
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
+; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm5
+; AVX2-FCP-NEXT: vpermd %ymm5, %ymm2, %ymm5
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm7
+; AVX2-FCP-NEXT: vpermd %ymm7, %ymm2, %ymm7
+; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm8
+; AVX2-FCP-NEXT: vpermd %ymm8, %ymm2, %ymm8
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
+; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm8
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1]
-; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm10
-; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm11
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3]
-; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm10
-; AVX2-FCP-NEXT: vpermd %ymm10, %ymm6, %ymm10
-; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm9
-; AVX2-FCP-NEXT: vpermd %ymm9, %ymm6, %ymm9
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm5, %xmm10
-; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm4, %xmm9
+; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm6
+; AVX2-FCP-NEXT: vpermd %ymm6, %ymm2, %ymm6
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm8
+; AVX2-FCP-NEXT: vpermd %ymm8, %ymm2, %ymm8
+; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm9
+; AVX2-FCP-NEXT: vpermd %ymm9, %ymm2, %ymm9
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7]
+; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm9
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1]
-; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm11
-; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm12
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3]
-; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm11
-; AVX2-FCP-NEXT: vpermd %ymm11, %ymm6, %ymm11
-; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm10
-; AVX2-FCP-NEXT: vpermd %ymm10, %ymm6, %ymm10
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
-; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm5
-; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm4
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm3
-; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm2
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
-; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm1
-; AVX2-FCP-NEXT: vpermd %ymm1, %ymm6, %ymm1
-; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm0
-; AVX2-FCP-NEXT: vpermd %ymm0, %ymm6, %ymm0
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqa %ymm7, (%rsi)
-; AVX2-FCP-NEXT: vmovdqa %ymm8, (%rdx)
-; AVX2-FCP-NEXT: vmovdqa %ymm9, (%rcx)
+; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm7
+; AVX2-FCP-NEXT: vpermd %ymm7, %ymm2, %ymm7
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm9[0,1],xmm7[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm4
+; AVX2-FCP-NEXT: vpermd %ymm4, %ymm2, %ymm4
+; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm3
+; AVX2-FCP-NEXT: vpermd %ymm3, %ymm2, %ymm3
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
+; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm1
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm0
+; AVX2-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm0
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqa %ymm5, (%rsi)
+; AVX2-FCP-NEXT: vmovdqa %ymm6, (%rdx)
+; AVX2-FCP-NEXT: vmovdqa %ymm7, (%rcx)
; AVX2-FCP-NEXT: vmovdqa %ymm0, (%r8)
; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
@@ -2696,517 +2642,379 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX2-LABEL: load_i8_stride4_vf64:
; AVX2: # %bb.0:
-; AVX2-NEXT: subq $168, %rsp
-; AVX2-NEXT: vmovdqa 64(%rdi), %ymm0
-; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqa 96(%rdi), %ymm2
-; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqa (%rdi), %xmm4
-; AVX2-NEXT: vmovdqa 16(%rdi), %xmm5
-; AVX2-NEXT: vmovdqa 32(%rdi), %xmm12
-; AVX2-NEXT: vmovdqa 48(%rdi), %xmm7
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX2-NEXT: vpshufb %xmm3, %xmm7, %xmm1
-; AVX2-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vpshufb %xmm3, %xmm12, %xmm8
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1]
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX2-NEXT: vpshufb %xmm13, %xmm5, %xmm8
-; AVX2-NEXT: vpshufb %xmm13, %xmm4, %xmm9
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm1[2,3]
-; AVX2-NEXT: vpshufb %ymm13, %ymm2, %ymm9
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,4,0,4,0,4,0,4]
-; AVX2-NEXT: vpermd %ymm9, %ymm1, %ymm9
-; AVX2-NEXT: vpshufb %ymm13, %ymm0, %ymm10
-; AVX2-NEXT: vpermd %ymm10, %ymm1, %ymm10
-; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqa 176(%rdi), %xmm8
-; AVX2-NEXT: vpshufb %xmm3, %xmm8, %xmm10
-; AVX2-NEXT: vmovdqa 160(%rdi), %xmm9
-; AVX2-NEXT: vpshufb %xmm3, %xmm9, %xmm3
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1]
-; AVX2-NEXT: vmovdqa 144(%rdi), %xmm0
-; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vpshufb %xmm13, %xmm0, %xmm10
-; AVX2-NEXT: vmovdqa 128(%rdi), %xmm0
-; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vpshufb %xmm13, %xmm0, %xmm14
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1]
-; AVX2-NEXT: vmovdqa 224(%rdi), %ymm11
-; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm14[0,1],xmm3[2,3]
-; AVX2-NEXT: vpshufb %ymm13, %ymm11, %ymm14
-; AVX2-NEXT: vpermd %ymm14, %ymm1, %ymm15
-; AVX2-NEXT: vmovdqa 192(%rdi), %ymm0
-; AVX2-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
-; AVX2-NEXT: vpshufb %ymm13, %ymm0, %ymm13
-; AVX2-NEXT: vpermd %ymm13, %ymm1, %ymm13
-; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm15[6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7]
-; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX2-NEXT: vpshufb %xmm3, %xmm7, %xmm13
-; AVX2-NEXT: vpshufb %xmm3, %xmm12, %xmm15
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1]
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm15 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX2-NEXT: vpshufb %xmm15, %xmm5, %xmm0
-; AVX2-NEXT: vmovdqa %xmm5, %xmm10
-; AVX2-NEXT: vpshufb %xmm15, %xmm4, %xmm2
-; AVX2-NEXT: vmovdqa %xmm4, %xmm14
-; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm13[2,3]
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT: vpshufb %ymm15, %ymm6, %ymm2
-; AVX2-NEXT: vpermd %ymm2, %ymm1, %ymm2
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-NEXT: vpshufb %ymm15, %ymm5, %ymm13
-; AVX2-NEXT: vpermd %ymm13, %ymm1, %ymm13
-; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5],ymm2[6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqa %xmm8, %xmm4
-; AVX2-NEXT: vpshufb %xmm3, %xmm8, %xmm0
-; AVX2-NEXT: vmovdqa %xmm9, %xmm7
-; AVX2-NEXT: vpshufb %xmm3, %xmm9, %xmm2
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX2-NEXT: vpshufb %xmm15, %xmm8, %xmm2
-; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX2-NEXT: vpshufb %xmm15, %xmm9, %xmm3
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
-; AVX2-NEXT: vpshufb %ymm15, %ymm11, %ymm2
-; AVX2-NEXT: vpermd %ymm2, %ymm1, %ymm2
-; AVX2-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload
-; AVX2-NEXT: vpshufb %ymm15, %ymm3, %ymm3
-; AVX2-NEXT: vpermd %ymm3, %ymm1, %ymm3
-; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm2
-; AVX2-NEXT: vpshufb %xmm0, %xmm12, %xmm3
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX2-NEXT: vpshufb %xmm3, %xmm10, %xmm13
-; AVX2-NEXT: vpshufb %xmm3, %xmm14, %xmm15
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3]
-; AVX2-NEXT: vpshufb %ymm3, %ymm6, %ymm13
-; AVX2-NEXT: vpermd %ymm13, %ymm1, %ymm13
-; AVX2-NEXT: vpshufb %ymm3, %ymm5, %ymm15
-; AVX2-NEXT: vpermd %ymm15, %ymm1, %ymm15
+; AVX2-NEXT: vmovdqa 160(%rdi), %ymm0
+; AVX2-NEXT: vmovdqa 128(%rdi), %ymm1
+; AVX2-NEXT: vmovdqa 192(%rdi), %ymm2
+; AVX2-NEXT: vmovdqa 224(%rdi), %ymm4
+; AVX2-NEXT: vmovdqa (%rdi), %ymm7
+; AVX2-NEXT: vmovdqa 32(%rdi), %ymm6
+; AVX2-NEXT: vmovdqa 64(%rdi), %ymm8
+; AVX2-NEXT: vmovdqa 96(%rdi), %ymm9
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX2-NEXT: vpshufb %ymm10, %ymm9, %ymm5
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,4,0,4,0,4,0,4]
+; AVX2-NEXT: vpermd %ymm5, %ymm3, %ymm5
+; AVX2-NEXT: vpshufb %ymm10, %ymm8, %ymm11
+; AVX2-NEXT: vpermd %ymm11, %ymm3, %ymm11
+; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-NEXT: vpshufb %ymm10, %ymm7, %ymm11
+; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm12
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1]
+; AVX2-NEXT: vpshufb %ymm10, %ymm6, %ymm12
+; AVX2-NEXT: vpermd %ymm12, %ymm3, %ymm12
+; AVX2-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],xmm12[2,3]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vpshufb %ymm10, %ymm4, %ymm11
+; AVX2-NEXT: vpermd %ymm11, %ymm3, %ymm11
+; AVX2-NEXT: vpshufb %ymm10, %ymm2, %ymm12
+; AVX2-NEXT: vpermd %ymm12, %ymm3, %ymm12
+; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7]
+; AVX2-NEXT: vpshufb %ymm10, %ymm1, %ymm12
+; AVX2-NEXT: vextracti128 $1, %ymm12, %xmm13
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1]
+; AVX2-NEXT: vpshufb %ymm10, %ymm0, %ymm10
+; AVX2-NEXT: vpermd %ymm10, %ymm3, %ymm10
+; AVX2-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm11[4,5,6,7]
+; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm12 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX2-NEXT: vpshufb %ymm12, %ymm9, %ymm11
+; AVX2-NEXT: vpermd %ymm11, %ymm3, %ymm11
+; AVX2-NEXT: vpshufb %ymm12, %ymm8, %ymm13
+; AVX2-NEXT: vpermd %ymm13, %ymm3, %ymm13
+; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5],ymm11[6,7]
+; AVX2-NEXT: vpshufb %ymm12, %ymm7, %ymm13
+; AVX2-NEXT: vextracti128 $1, %ymm13, %xmm14
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1]
+; AVX2-NEXT: vpshufb %ymm12, %ymm6, %ymm14
+; AVX2-NEXT: vpermd %ymm14, %ymm3, %ymm14
+; AVX2-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],xmm14[2,3]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7]
+; AVX2-NEXT: vpshufb %ymm12, %ymm4, %ymm13
+; AVX2-NEXT: vpermd %ymm13, %ymm3, %ymm13
+; AVX2-NEXT: vpshufb %ymm12, %ymm2, %ymm14
+; AVX2-NEXT: vpermd %ymm14, %ymm3, %ymm14
+; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7]
+; AVX2-NEXT: vpshufb %ymm12, %ymm1, %ymm14
+; AVX2-NEXT: vextracti128 $1, %ymm14, %xmm15
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1]
+; AVX2-NEXT: vpshufb %ymm12, %ymm0, %ymm12
+; AVX2-NEXT: vpermd %ymm12, %ymm3, %ymm12
+; AVX2-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm14 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX2-NEXT: vpshufb %ymm14, %ymm9, %ymm13
+; AVX2-NEXT: vpermd %ymm13, %ymm3, %ymm13
+; AVX2-NEXT: vpshufb %ymm14, %ymm8, %ymm15
+; AVX2-NEXT: vpermd %ymm15, %ymm3, %ymm15
; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5],ymm13[6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm13[4,5,6,7]
-; AVX2-NEXT: vpshufb %xmm0, %xmm4, %xmm2
-; AVX2-NEXT: vmovdqa %xmm4, %xmm14
-; AVX2-NEXT: vpshufb %xmm0, %xmm7, %xmm0
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX2-NEXT: vpshufb %xmm3, %xmm8, %xmm2
-; AVX2-NEXT: vpshufb %xmm3, %xmm9, %xmm15
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm15[0],xmm2[0],xmm15[1],xmm2[1]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
-; AVX2-NEXT: vmovdqa %ymm11, %ymm15
-; AVX2-NEXT: vpshufb %ymm3, %ymm11, %ymm2
-; AVX2-NEXT: vpermd %ymm2, %ymm1, %ymm2
-; AVX2-NEXT: vmovdqu (%rsp), %ymm11 # 32-byte Reload
-; AVX2-NEXT: vpshufb %ymm3, %ymm11, %ymm3
-; AVX2-NEXT: vpermd %ymm3, %ymm1, %ymm3
-; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm2
-; AVX2-NEXT: vpshufb %xmm0, %xmm12, %xmm6
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1]
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX2-NEXT: vpshufb %xmm6, %xmm10, %xmm5
-; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX2-NEXT: vpshufb %xmm6, %xmm4, %xmm4
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3]
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-NEXT: vpshufb %ymm6, %ymm4, %ymm4
-; AVX2-NEXT: vpermd %ymm4, %ymm1, %ymm4
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-NEXT: vpshufb %ymm6, %ymm5, %ymm5
-; AVX2-NEXT: vpermd %ymm5, %ymm1, %ymm5
-; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX2-NEXT: vpshufb %xmm0, %xmm14, %xmm4
-; AVX2-NEXT: vpshufb %xmm0, %xmm7, %xmm0
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; AVX2-NEXT: vpshufb %xmm6, %xmm8, %xmm4
-; AVX2-NEXT: vpshufb %xmm6, %xmm9, %xmm5
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3]
-; AVX2-NEXT: vpshufb %ymm6, %ymm15, %ymm4
-; AVX2-NEXT: vpshufb %ymm6, %ymm11, %ymm5
-; AVX2-NEXT: vpermd %ymm4, %ymm1, %ymm4
-; AVX2-NEXT: vpermd %ymm5, %ymm1, %ymm1
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: vpshufb %ymm14, %ymm7, %ymm15
+; AVX2-NEXT: vextracti128 $1, %ymm15, %xmm5
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1]
+; AVX2-NEXT: vpshufb %ymm14, %ymm6, %ymm15
+; AVX2-NEXT: vpermd %ymm15, %ymm3, %ymm15
+; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm15[2,3]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm13[4,5,6,7]
+; AVX2-NEXT: vpshufb %ymm14, %ymm4, %ymm5
+; AVX2-NEXT: vpermd %ymm5, %ymm3, %ymm5
+; AVX2-NEXT: vpshufb %ymm14, %ymm2, %ymm15
+; AVX2-NEXT: vpermd %ymm15, %ymm3, %ymm15
+; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-NEXT: vpshufb %ymm14, %ymm1, %ymm15
+; AVX2-NEXT: vextracti128 $1, %ymm15, %xmm10
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm15[0],xmm10[0],xmm15[1],xmm10[1]
+; AVX2-NEXT: vpshufb %ymm14, %ymm0, %ymm14
+; AVX2-NEXT: vpermd %ymm14, %ymm3, %ymm14
+; AVX2-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],xmm14[2,3]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm10[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX2-NEXT: vpshufb %ymm5, %ymm9, %ymm9
+; AVX2-NEXT: vpermd %ymm9, %ymm3, %ymm9
+; AVX2-NEXT: vpshufb %ymm5, %ymm8, %ymm8
+; AVX2-NEXT: vpermd %ymm8, %ymm3, %ymm8
+; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7]
+; AVX2-NEXT: vpshufb %ymm5, %ymm7, %ymm7
+; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm9
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1]
+; AVX2-NEXT: vpshufb %ymm5, %ymm6, %ymm6
+; AVX2-NEXT: vpermd %ymm6, %ymm3, %ymm6
+; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-NEXT: vpshufb %ymm5, %ymm4, %ymm4
+; AVX2-NEXT: vpermd %ymm4, %ymm3, %ymm4
+; AVX2-NEXT: vpshufb %ymm5, %ymm2, %ymm2
+; AVX2-NEXT: vpermd %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
+; AVX2-NEXT: vpshufb %ymm5, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; AVX2-NEXT: vpshufb %ymm5, %ymm0, %ymm0
+; AVX2-NEXT: vpermd %ymm0, %ymm3, %ymm0
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm1, 32(%rsi)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm1, (%rsi)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm1, 32(%rdx)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm1, (%rdx)
-; AVX2-NEXT: vmovdqa %ymm3, 32(%rcx)
+; AVX2-NEXT: vmovdqa %ymm12, 32(%rdx)
+; AVX2-NEXT: vmovdqa %ymm11, (%rdx)
+; AVX2-NEXT: vmovdqa %ymm14, 32(%rcx)
; AVX2-NEXT: vmovdqa %ymm13, (%rcx)
; AVX2-NEXT: vmovdqa %ymm0, 32(%r8)
-; AVX2-NEXT: vmovdqa %ymm2, (%r8)
-; AVX2-NEXT: addq $168, %rsp
+; AVX2-NEXT: vmovdqa %ymm6, (%r8)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX2-FP-LABEL: load_i8_stride4_vf64:
; AVX2-FP: # %bb.0:
-; AVX2-FP-NEXT: subq $168, %rsp
-; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm0
-; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm2
-; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm4
-; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm5
-; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm12
-; AVX2-FP-NEXT: vmovdqa 48(%rdi), %xmm7
-; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX2-FP-NEXT: vpshufb %xmm3, %xmm7, %xmm1
-; AVX2-FP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FP-NEXT: vpshufb %xmm3, %xmm12, %xmm8
-; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1]
-; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX2-FP-NEXT: vpshufb %xmm13, %xmm5, %xmm8
-; AVX2-FP-NEXT: vpshufb %xmm13, %xmm4, %xmm9
-; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm1[2,3]
-; AVX2-FP-NEXT: vpshufb %ymm13, %ymm2, %ymm9
-; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,4,0,4,0,4,0,4]
-; AVX2-FP-NEXT: vpermd %ymm9, %ymm1, %ymm9
-; AVX2-FP-NEXT: vpshufb %ymm13, %ymm0, %ymm10
-; AVX2-FP-NEXT: vpermd %ymm10, %ymm1, %ymm10
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vmovdqa 176(%rdi), %xmm8
-; AVX2-FP-NEXT: vpshufb %xmm3, %xmm8, %xmm10
-; AVX2-FP-NEXT: vmovdqa 160(%rdi), %xmm9
-; AVX2-FP-NEXT: vpshufb %xmm3, %xmm9, %xmm3
-; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1]
-; AVX2-FP-NEXT: vmovdqa 144(%rdi), %xmm0
-; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FP-NEXT: vpshufb %xmm13, %xmm0, %xmm10
-; AVX2-FP-NEXT: vmovdqa 128(%rdi), %xmm0
-; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FP-NEXT: vpshufb %xmm13, %xmm0, %xmm14
-; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1]
-; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm11
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm14[0,1],xmm3[2,3]
-; AVX2-FP-NEXT: vpshufb %ymm13, %ymm11, %ymm14
-; AVX2-FP-NEXT: vpermd %ymm14, %ymm1, %ymm15
-; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm0
-; AVX2-FP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
-; AVX2-FP-NEXT: vpshufb %ymm13, %ymm0, %ymm13
-; AVX2-FP-NEXT: vpermd %ymm13, %ymm1, %ymm13
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm15[6,7]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7]
-; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX2-FP-NEXT: vpshufb %xmm3, %xmm7, %xmm13
-; AVX2-FP-NEXT: vpshufb %xmm3, %xmm12, %xmm15
-; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1]
-; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX2-FP-NEXT: vpshufb %xmm15, %xmm5, %xmm0
-; AVX2-FP-NEXT: vmovdqa %xmm5, %xmm10
-; AVX2-FP-NEXT: vpshufb %xmm15, %xmm4, %xmm2
-; AVX2-FP-NEXT: vmovdqa %xmm4, %xmm14
-; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm13[2,3]
-; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT: vpshufb %ymm15, %ymm6, %ymm2
-; AVX2-FP-NEXT: vpermd %ymm2, %ymm1, %ymm2
-; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FP-NEXT: vpshufb %ymm15, %ymm5, %ymm13
-; AVX2-FP-NEXT: vpermd %ymm13, %ymm1, %ymm13
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5],ymm2[6,7]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vmovdqa %xmm8, %xmm4
-; AVX2-FP-NEXT: vpshufb %xmm3, %xmm8, %xmm0
-; AVX2-FP-NEXT: vmovdqa %xmm9, %xmm7
-; AVX2-FP-NEXT: vpshufb %xmm3, %xmm9, %xmm2
-; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX2-FP-NEXT: vpshufb %xmm15, %xmm8, %xmm2
-; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX2-FP-NEXT: vpshufb %xmm15, %xmm9, %xmm3
-; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
-; AVX2-FP-NEXT: vpshufb %ymm15, %ymm11, %ymm2
-; AVX2-FP-NEXT: vpermd %ymm2, %ymm1, %ymm2
-; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload
-; AVX2-FP-NEXT: vpshufb %ymm15, %ymm3, %ymm3
-; AVX2-FP-NEXT: vpermd %ymm3, %ymm1, %ymm3
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-FP-NEXT: vpshufb %xmm0, %xmm2, %xmm2
-; AVX2-FP-NEXT: vpshufb %xmm0, %xmm12, %xmm3
-; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX2-FP-NEXT: vpshufb %xmm3, %xmm10, %xmm13
-; AVX2-FP-NEXT: vpshufb %xmm3, %xmm14, %xmm15
-; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3]
-; AVX2-FP-NEXT: vpshufb %ymm3, %ymm6, %ymm13
-; AVX2-FP-NEXT: vpermd %ymm13, %ymm1, %ymm13
-; AVX2-FP-NEXT: vpshufb %ymm3, %ymm5, %ymm15
-; AVX2-FP-NEXT: vpermd %ymm15, %ymm1, %ymm15
+; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm0
+; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm1
+; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm2
+; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm4
+; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm7
+; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm6
+; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm8
+; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm9
+; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX2-FP-NEXT: vpshufb %ymm10, %ymm9, %ymm5
+; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,4,0,4,0,4,0,4]
+; AVX2-FP-NEXT: vpermd %ymm5, %ymm3, %ymm5
+; AVX2-FP-NEXT: vpshufb %ymm10, %ymm8, %ymm11
+; AVX2-FP-NEXT: vpermd %ymm11, %ymm3, %ymm11
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-FP-NEXT: vpshufb %ymm10, %ymm7, %ymm11
+; AVX2-FP-NEXT: vextracti128 $1, %ymm11, %xmm12
+; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1]
+; AVX2-FP-NEXT: vpshufb %ymm10, %ymm6, %ymm12
+; AVX2-FP-NEXT: vpermd %ymm12, %ymm3, %ymm12
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],xmm12[2,3]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vpshufb %ymm10, %ymm4, %ymm11
+; AVX2-FP-NEXT: vpermd %ymm11, %ymm3, %ymm11
+; AVX2-FP-NEXT: vpshufb %ymm10, %ymm2, %ymm12
+; AVX2-FP-NEXT: vpermd %ymm12, %ymm3, %ymm12
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7]
+; AVX2-FP-NEXT: vpshufb %ymm10, %ymm1, %ymm12
+; AVX2-FP-NEXT: vextracti128 $1, %ymm12, %xmm13
+; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1]
+; AVX2-FP-NEXT: vpshufb %ymm10, %ymm0, %ymm10
+; AVX2-FP-NEXT: vpermd %ymm10, %ymm3, %ymm10
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm11[4,5,6,7]
+; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm12 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX2-FP-NEXT: vpshufb %ymm12, %ymm9, %ymm11
+; AVX2-FP-NEXT: vpermd %ymm11, %ymm3, %ymm11
+; AVX2-FP-NEXT: vpshufb %ymm12, %ymm8, %ymm13
+; AVX2-FP-NEXT: vpermd %ymm13, %ymm3, %ymm13
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5],ymm11[6,7]
+; AVX2-FP-NEXT: vpshufb %ymm12, %ymm7, %ymm13
+; AVX2-FP-NEXT: vextracti128 $1, %ymm13, %xmm14
+; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1]
+; AVX2-FP-NEXT: vpshufb %ymm12, %ymm6, %ymm14
+; AVX2-FP-NEXT: vpermd %ymm14, %ymm3, %ymm14
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],xmm14[2,3]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7]
+; AVX2-FP-NEXT: vpshufb %ymm12, %ymm4, %ymm13
+; AVX2-FP-NEXT: vpermd %ymm13, %ymm3, %ymm13
+; AVX2-FP-NEXT: vpshufb %ymm12, %ymm2, %ymm14
+; AVX2-FP-NEXT: vpermd %ymm14, %ymm3, %ymm14
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7]
+; AVX2-FP-NEXT: vpshufb %ymm12, %ymm1, %ymm14
+; AVX2-FP-NEXT: vextracti128 $1, %ymm14, %xmm15
+; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1]
+; AVX2-FP-NEXT: vpshufb %ymm12, %ymm0, %ymm12
+; AVX2-FP-NEXT: vpermd %ymm12, %ymm3, %ymm12
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
+; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX2-FP-NEXT: vpshufb %ymm14, %ymm9, %ymm13
+; AVX2-FP-NEXT: vpermd %ymm13, %ymm3, %ymm13
+; AVX2-FP-NEXT: vpshufb %ymm14, %ymm8, %ymm15
+; AVX2-FP-NEXT: vpermd %ymm15, %ymm3, %ymm15
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5],ymm13[6,7]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm13[4,5,6,7]
-; AVX2-FP-NEXT: vpshufb %xmm0, %xmm4, %xmm2
-; AVX2-FP-NEXT: vmovdqa %xmm4, %xmm14
-; AVX2-FP-NEXT: vpshufb %xmm0, %xmm7, %xmm0
-; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX2-FP-NEXT: vpshufb %xmm3, %xmm8, %xmm2
-; AVX2-FP-NEXT: vpshufb %xmm3, %xmm9, %xmm15
-; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm15[0],xmm2[0],xmm15[1],xmm2[1]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
-; AVX2-FP-NEXT: vmovdqa %ymm11, %ymm15
-; AVX2-FP-NEXT: vpshufb %ymm3, %ymm11, %ymm2
-; AVX2-FP-NEXT: vpermd %ymm2, %ymm1, %ymm2
-; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm11 # 32-byte Reload
-; AVX2-FP-NEXT: vpshufb %ymm3, %ymm11, %ymm3
-; AVX2-FP-NEXT: vpermd %ymm3, %ymm1, %ymm3
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-FP-NEXT: vpshufb %xmm0, %xmm2, %xmm2
-; AVX2-FP-NEXT: vpshufb %xmm0, %xmm12, %xmm6
-; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1]
-; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX2-FP-NEXT: vpshufb %xmm6, %xmm10, %xmm5
-; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX2-FP-NEXT: vpshufb %xmm6, %xmm4, %xmm4
-; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3]
-; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FP-NEXT: vpshufb %ymm6, %ymm4, %ymm4
-; AVX2-FP-NEXT: vpermd %ymm4, %ymm1, %ymm4
-; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FP-NEXT: vpshufb %ymm6, %ymm5, %ymm5
-; AVX2-FP-NEXT: vpermd %ymm5, %ymm1, %ymm5
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX2-FP-NEXT: vpshufb %xmm0, %xmm14, %xmm4
-; AVX2-FP-NEXT: vpshufb %xmm0, %xmm7, %xmm0
-; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; AVX2-FP-NEXT: vpshufb %xmm6, %xmm8, %xmm4
-; AVX2-FP-NEXT: vpshufb %xmm6, %xmm9, %xmm5
-; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3]
-; AVX2-FP-NEXT: vpshufb %ymm6, %ymm15, %ymm4
-; AVX2-FP-NEXT: vpshufb %ymm6, %ymm11, %ymm5
-; AVX2-FP-NEXT: vpermd %ymm4, %ymm1, %ymm4
-; AVX2-FP-NEXT: vpermd %ymm5, %ymm1, %ymm1
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FP-NEXT: vpshufb %ymm14, %ymm7, %ymm15
+; AVX2-FP-NEXT: vextracti128 $1, %ymm15, %xmm5
+; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1]
+; AVX2-FP-NEXT: vpshufb %ymm14, %ymm6, %ymm15
+; AVX2-FP-NEXT: vpermd %ymm15, %ymm3, %ymm15
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm15[2,3]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm13[4,5,6,7]
+; AVX2-FP-NEXT: vpshufb %ymm14, %ymm4, %ymm5
+; AVX2-FP-NEXT: vpermd %ymm5, %ymm3, %ymm5
+; AVX2-FP-NEXT: vpshufb %ymm14, %ymm2, %ymm15
+; AVX2-FP-NEXT: vpermd %ymm15, %ymm3, %ymm15
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-FP-NEXT: vpshufb %ymm14, %ymm1, %ymm15
+; AVX2-FP-NEXT: vextracti128 $1, %ymm15, %xmm10
+; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm15[0],xmm10[0],xmm15[1],xmm10[1]
+; AVX2-FP-NEXT: vpshufb %ymm14, %ymm0, %ymm14
+; AVX2-FP-NEXT: vpermd %ymm14, %ymm3, %ymm14
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],xmm14[2,3]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm10[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX2-FP-NEXT: vpshufb %ymm5, %ymm9, %ymm9
+; AVX2-FP-NEXT: vpermd %ymm9, %ymm3, %ymm9
+; AVX2-FP-NEXT: vpshufb %ymm5, %ymm8, %ymm8
+; AVX2-FP-NEXT: vpermd %ymm8, %ymm3, %ymm8
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7]
+; AVX2-FP-NEXT: vpshufb %ymm5, %ymm7, %ymm7
+; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm9
+; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1]
+; AVX2-FP-NEXT: vpshufb %ymm5, %ymm6, %ymm6
+; AVX2-FP-NEXT: vpermd %ymm6, %ymm3, %ymm6
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-FP-NEXT: vpshufb %ymm5, %ymm4, %ymm4
+; AVX2-FP-NEXT: vpermd %ymm4, %ymm3, %ymm4
+; AVX2-FP-NEXT: vpshufb %ymm5, %ymm2, %ymm2
+; AVX2-FP-NEXT: vpermd %ymm2, %ymm3, %ymm2
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
+; AVX2-FP-NEXT: vpshufb %ymm5, %ymm1, %ymm1
+; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm4
+; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; AVX2-FP-NEXT: vpshufb %ymm5, %ymm0, %ymm0
+; AVX2-FP-NEXT: vpermd %ymm0, %ymm3, %ymm0
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rsi)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm1, (%rsi)
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rdx)
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT: vmovaps %ymm1, (%rdx)
-; AVX2-FP-NEXT: vmovdqa %ymm3, 32(%rcx)
+; AVX2-FP-NEXT: vmovdqa %ymm12, 32(%rdx)
+; AVX2-FP-NEXT: vmovdqa %ymm11, (%rdx)
+; AVX2-FP-NEXT: vmovdqa %ymm14, 32(%rcx)
; AVX2-FP-NEXT: vmovdqa %ymm13, (%rcx)
; AVX2-FP-NEXT: vmovdqa %ymm0, 32(%r8)
-; AVX2-FP-NEXT: vmovdqa %ymm2, (%r8)
-; AVX2-FP-NEXT: addq $168, %rsp
+; AVX2-FP-NEXT: vmovdqa %ymm6, (%r8)
; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
;
; AVX2-FCP-LABEL: load_i8_stride4_vf64:
; AVX2-FCP: # %bb.0:
-; AVX2-FCP-NEXT: subq $168, %rsp
-; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm0
-; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm2
-; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm4
-; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm5
-; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm12
-; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm7
-; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm1
-; AVX2-FCP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm12, %xmm8
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1]
-; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm5, %xmm8
-; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm4, %xmm9
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm1[2,3]
-; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm9
-; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,4,0,4,0,4,0,4]
-; AVX2-FCP-NEXT: vpermd %ymm9, %ymm1, %ymm9
-; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm0, %ymm10
-; AVX2-FCP-NEXT: vpermd %ymm10, %ymm1, %ymm10
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 176(%rdi), %xmm8
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm10
-; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %xmm9
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm3
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1]
-; AVX2-FCP-NEXT: vmovdqa 144(%rdi), %xmm0
-; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm10
-; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %xmm0
-; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm14
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1]
-; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm11
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm14[0,1],xmm3[2,3]
-; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm11, %ymm14
-; AVX2-FCP-NEXT: vpermd %ymm14, %ymm1, %ymm15
-; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm0
-; AVX2-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
-; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm0, %ymm13
-; AVX2-FCP-NEXT: vpermd %ymm13, %ymm1, %ymm13
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm15[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm13
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm12, %xmm15
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1]
-; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm5, %xmm0
-; AVX2-FCP-NEXT: vmovdqa %xmm5, %xmm10
-; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm4, %xmm2
-; AVX2-FCP-NEXT: vmovdqa %xmm4, %xmm14
-; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm13[2,3]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm6, %ymm2
-; AVX2-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm2
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm5, %ymm13
-; AVX2-FCP-NEXT: vpermd %ymm13, %ymm1, %ymm13
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5],ymm2[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa %xmm8, %xmm4
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm0
-; AVX2-FCP-NEXT: vmovdqa %xmm9, %xmm7
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm2
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm8, %xmm2
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm9, %xmm3
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
-; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm11, %ymm2
-; AVX2-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm2
-; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm3, %ymm3
-; AVX2-FCP-NEXT: vpermd %ymm3, %ymm1, %ymm3
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm2
-; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm3
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm10, %xmm13
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm14, %xmm15
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3]
-; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm6, %ymm13
-; AVX2-FCP-NEXT: vpermd %ymm13, %ymm1, %ymm13
-; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm15
-; AVX2-FCP-NEXT: vpermd %ymm15, %ymm1, %ymm15
+; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm0
+; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm1
+; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm2
+; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm4
+; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm7
+; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm6
+; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm8
+; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm9
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm5
+; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,4,0,4,0,4,0,4]
+; AVX2-FCP-NEXT: vpermd %ymm5, %ymm3, %ymm5
+; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm11
+; AVX2-FCP-NEXT: vpermd %ymm11, %ymm3, %ymm11
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm7, %ymm11
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1]
+; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm6, %ymm12
+; AVX2-FCP-NEXT: vpermd %ymm12, %ymm3, %ymm12
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],xmm12[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm11
+; AVX2-FCP-NEXT: vpermd %ymm11, %ymm3, %ymm11
+; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm12
+; AVX2-FCP-NEXT: vpermd %ymm12, %ymm3, %ymm12
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7]
+; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm12
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1]
+; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm10
+; AVX2-FCP-NEXT: vpermd %ymm10, %ymm3, %ymm10
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm11[4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm12 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm9, %ymm11
+; AVX2-FCP-NEXT: vpermd %ymm11, %ymm3, %ymm11
+; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm8, %ymm13
+; AVX2-FCP-NEXT: vpermd %ymm13, %ymm3, %ymm13
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5],ymm11[6,7]
+; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm7, %ymm13
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1]
+; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm14
+; AVX2-FCP-NEXT: vpermd %ymm14, %ymm3, %ymm14
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],xmm14[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7]
+; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm4, %ymm13
+; AVX2-FCP-NEXT: vpermd %ymm13, %ymm3, %ymm13
+; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm14
+; AVX2-FCP-NEXT: vpermd %ymm14, %ymm3, %ymm14
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7]
+; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm14
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1]
+; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm12
+; AVX2-FCP-NEXT: vpermd %ymm12, %ymm3, %ymm12
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm9, %ymm13
+; AVX2-FCP-NEXT: vpermd %ymm13, %ymm3, %ymm13
+; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm8, %ymm15
+; AVX2-FCP-NEXT: vpermd %ymm15, %ymm3, %ymm15
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5],ymm13[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm13[4,5,6,7]
-; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm2
-; AVX2-FCP-NEXT: vmovdqa %xmm4, %xmm14
-; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm0
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm2
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm15
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm15[0],xmm2[0],xmm15[1],xmm2[1]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
-; AVX2-FCP-NEXT: vmovdqa %ymm11, %ymm15
-; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm11, %ymm2
-; AVX2-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm2
-; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm11 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm11, %ymm3
-; AVX2-FCP-NEXT: vpermd %ymm3, %ymm1, %ymm3
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm2
-; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm6
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1]
-; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm10, %xmm5
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm4
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm4
-; AVX2-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm4
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5
-; AVX2-FCP-NEXT: vpermd %ymm5, %ymm1, %ymm5
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm14, %xmm4
-; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm0
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm8, %xmm4
-; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm9, %xmm5
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3]
-; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm15, %ymm4
-; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm11, %ymm5
-; AVX2-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm4
-; AVX2-FCP-NEXT: vpermd %ymm5, %ymm1, %ymm1
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm7, %ymm15
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm15, %xmm5
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1]
+; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm6, %ymm15
+; AVX2-FCP-NEXT: vpermd %ymm15, %ymm3, %ymm15
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm15[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm13[4,5,6,7]
+; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm4, %ymm5
+; AVX2-FCP-NEXT: vpermd %ymm5, %ymm3, %ymm5
+; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm15
+; AVX2-FCP-NEXT: vpermd %ymm15, %ymm3, %ymm15
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm1, %ymm15
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm15, %xmm10
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm15[0],xmm10[0],xmm15[1],xmm10[1]
+; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm0, %ymm14
+; AVX2-FCP-NEXT: vpermd %ymm14, %ymm3, %ymm14
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],xmm14[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm10[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm9, %ymm9
+; AVX2-FCP-NEXT: vpermd %ymm9, %ymm3, %ymm9
+; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm8, %ymm8
+; AVX2-FCP-NEXT: vpermd %ymm8, %ymm3, %ymm8
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7]
+; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm7, %ymm7
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm9
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1]
+; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm6, %ymm6
+; AVX2-FCP-NEXT: vpermd %ymm6, %ymm3, %ymm6
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm4
+; AVX2-FCP-NEXT: vpermd %ymm4, %ymm3, %ymm4
+; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2
+; AVX2-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
+; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm1
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm0
+; AVX2-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rsi)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, (%rsi)
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rdx)
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovaps %ymm1, (%rdx)
-; AVX2-FCP-NEXT: vmovdqa %ymm3, 32(%rcx)
+; AVX2-FCP-NEXT: vmovdqa %ymm12, 32(%rdx)
+; AVX2-FCP-NEXT: vmovdqa %ymm11, (%rdx)
+; AVX2-FCP-NEXT: vmovdqa %ymm14, 32(%rcx)
; AVX2-FCP-NEXT: vmovdqa %ymm13, (%rcx)
; AVX2-FCP-NEXT: vmovdqa %ymm0, 32(%r8)
-; AVX2-FCP-NEXT: vmovdqa %ymm2, (%r8)
-; AVX2-FCP-NEXT: addq $168, %rsp
+; AVX2-FCP-NEXT: vmovdqa %ymm6, (%r8)
; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll
index e7bb02db627534..ac14f55e3f0ed0 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll
@@ -6395,203 +6395,203 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-LABEL: load_i8_stride5_vf64:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535]
-; AVX512-NEXT: vmovdqa64 (%rdi), %ymm23
-; AVX512-NEXT: vmovdqa64 32(%rdi), %ymm24
-; AVX512-NEXT: vmovdqa64 64(%rdi), %ymm21
-; AVX512-NEXT: vmovdqa64 96(%rdi), %ymm22
+; AVX512-NEXT: vmovdqa64 (%rdi), %ymm24
+; AVX512-NEXT: vmovdqa64 32(%rdi), %ymm25
+; AVX512-NEXT: vmovdqa64 64(%rdi), %ymm22
+; AVX512-NEXT: vmovdqa64 96(%rdi), %ymm23
; AVX512-NEXT: vmovdqa %ymm5, %ymm4
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ymm22 ^ (ymm4 & (ymm21 ^ ymm22))
-; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1]
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ ymm4))
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128]
-; AVX512-NEXT: vpshufb %ymm10, %ymm6, %ymm6
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23))
+; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1]
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (mem & (ymm7 ^ ymm4))
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128]
+; AVX512-NEXT: vpshufb %ymm6, %ymm7, %ymm7
; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535]
-; AVX512-NEXT: vmovdqa %ymm4, %ymm7
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ymm24 ^ (ymm7 & (ymm23 ^ ymm24))
-; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm8
-; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,xmm8[4,9,14],zero,zero,zero,xmm8[2,7,12,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,5,10,15],zero,zero,zero,xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u]
-; AVX512-NEXT: vpor %xmm7, %xmm8, %xmm12
-; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm19 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255]
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = (ymm12 & ymm19) | ymm6
-; AVX512-NEXT: vmovdqa64 192(%rdi), %ymm25
-; AVX512-NEXT: vmovdqa 224(%rdi), %ymm7
-; AVX512-NEXT: vmovdqa %ymm4, %ymm9
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm25 ^ (ymm9 & (ymm7 ^ ymm25))
-; AVX512-NEXT: vmovdqa 208(%rdi), %xmm8
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm8 ^ (mem & (ymm9 ^ ymm8))
-; AVX512-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,ymm9[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vmovdqa 176(%rdi), %xmm9
-; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u]
-; AVX512-NEXT: vmovdqa 160(%rdi), %xmm11
-; AVX512-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u]
-; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm15[2],xmm14[2],xmm15[3],xmm14[3]
+; AVX512-NEXT: vmovdqa %ymm4, %ymm8
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm25 ^ (ymm8 & (ymm24 ^ ymm25))
+; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm9
+; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12,u,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,5,10,15],zero,zero,zero,xmm8[3,8,13],zero,zero,zero,xmm8[u,u,u]
+; AVX512-NEXT: vpor %xmm9, %xmm8, %xmm10
+; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm20 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255]
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm20) | ymm7
+; AVX512-NEXT: vmovdqa64 192(%rdi), %ymm26
+; AVX512-NEXT: vmovdqa 224(%rdi), %ymm8
+; AVX512-NEXT: vmovdqa %ymm4, %ymm11
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm26 ^ (ymm11 & (ymm8 ^ ymm26))
+; AVX512-NEXT: vmovdqa 208(%rdi), %xmm9
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm9 ^ (mem & (ymm11 ^ ymm9))
+; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,ymm11[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vmovdqa 160(%rdi), %ymm12
+; AVX512-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm17 = [0,5,0,5,0,5,0,5]
+; AVX512-NEXT: vpermd %ymm12, %ymm17, %ymm15
; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = (ymm0 & ~ymm16) | ymm13
-; AVX512-NEXT: vmovdqa 144(%rdi), %xmm13
-; AVX512-NEXT: vpshufb %xmm10, %xmm13, %xmm10
-; AVX512-NEXT: vmovdqa 128(%rdi), %xmm14
-; AVX512-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
-; AVX512-NEXT: vpor %xmm10, %xmm15, %xmm10
-; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm10
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm20 & (zmm10 ^ zmm12))
-; AVX512-NEXT: vmovdqa 256(%rdi), %ymm15
-; AVX512-NEXT: vmovdqa 288(%rdi), %ymm12
-; AVX512-NEXT: vmovdqa %ymm5, %ymm2
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm15 ^ (ymm2 & (ymm12 ^ ymm15))
-; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero
-; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
-; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm18
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm15 = (ymm15 & ~ymm16) | ymm11
+; AVX512-NEXT: vmovdqa 144(%rdi), %xmm12
+; AVX512-NEXT: vpshufb %xmm6, %xmm12, %xmm6
+; AVX512-NEXT: vmovdqa 128(%rdi), %xmm13
+; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
+; AVX512-NEXT: vpor %xmm6, %xmm11, %xmm6
+; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm6
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm21 & (zmm6 ^ zmm10))
+; AVX512-NEXT: vmovdqa 256(%rdi), %ymm14
+; AVX512-NEXT: vmovdqa 288(%rdi), %ymm11
+; AVX512-NEXT: vmovdqa %ymm5, %ymm10
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = ymm14 ^ (ymm10 & (ymm11 ^ ymm14))
+; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm0
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u],zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[1,6,11]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,4,9,14],zero,zero,zero,xmm10[2,7,12],zero,zero,zero
+; AVX512-NEXT: vpor %xmm0, %xmm10, %xmm0
+; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5,6,7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm19
; AVX512-NEXT: vmovdqa %ymm4, %ymm0
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm15 ^ ymm12))
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12]
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11))
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm6
+; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12]
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero
-; AVX512-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpor %xmm6, %xmm0, %xmm0
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT: vmovdqa %ymm5, %ymm2
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm7 ^ ymm25))
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm8 ^ (mem & (ymm2 ^ ymm8))
-; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,ymm2[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[1,6,11],zero,zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm9[0,5,10,15,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpor %xmm3, %xmm10, %xmm3
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ~ymm16) | ymm2
+; AVX512-NEXT: vmovdqa %ymm5, %ymm6
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm26 ^ (ymm6 & (ymm8 ^ ymm26))
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm9 ^ (mem & (ymm6 ^ ymm9))
+; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,ymm6[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vmovdqa 160(%rdi), %xmm15
+; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm15[1,6,11],zero,zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vmovdqa 176(%rdi), %xmm6
+; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm6[0,5,10,15,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ~ymm16) | ymm10
; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0]
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm0 ^ (ymm16 & (ymm3 ^ ymm0))
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0))
; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535]
; AVX512-NEXT: vmovdqa %ymm10, %ymm0
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm22 ^ ymm21))
-; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0))
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22))
+; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm0))
; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,2,7,12,17,22,27,16,21,26,31,20,25,30,128,128,128,128,128,128]
-; AVX512-NEXT: vpshufb %ymm0, %ymm2, %ymm2
-; AVX512-NEXT: vmovdqa %ymm5, %ymm1
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm24 ^ (ymm1 & (ymm23 ^ ymm24))
-; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14],zero,zero,zero,xmm1[u,u,u]
-; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1
-; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[0,5,10,15],zero,zero,zero,xmm1[3,8,13,u,u,u]
-; AVX512-NEXT: vpor %xmm6, %xmm1, %xmm1
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ymm19) | ymm2
-; AVX512-NEXT: vpshufb %xmm0, %xmm13, %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
-; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX512-NEXT: vpshufb %ymm0, %ymm3, %ymm3
+; AVX512-NEXT: vmovdqa %ymm5, %ymm2
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm24 ^ ymm25))
+; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[1,6,11],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[u,u,u]
+; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2
+; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13,u,u,u]
+; AVX512-NEXT: vpor %xmm7, %xmm2, %xmm2
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm20) | ymm3
+; AVX512-NEXT: vpshufb %xmm0, %xmm12, %xmm0
+; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
+; AVX512-NEXT: vpor %xmm0, %xmm3, %xmm0
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm20 & (zmm0 ^ zmm1))
-; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm17
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm2))
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm18
; AVX512-NEXT: vmovdqa %ymm5, %ymm0
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm15 ^ ymm12))
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11))
; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,1,6,11],zero,zero,zero,zero,xmm0[4,9,14],zero,zero,zero
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13]
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa %ymm4, %ymm1
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm7 ^ (ymm1 & (ymm25 ^ ymm7))
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (mem & (ymm1 ^ ymm8))
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8))
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9))
; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm9[1,6,11,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[2,7,12],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[2,7,12],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0))
; AVX512-NEXT: vmovdqa %ymm4, %ymm0
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm22 ^ ymm21))
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22))
; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0))
; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128]
; AVX512-NEXT: vpshufb %ymm0, %ymm2, %ymm2
; AVX512-NEXT: vmovdqa %ymm10, %ymm3
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm23 ^ (ymm3 & (ymm24 ^ ymm23))
-; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm6
-; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[1,6,11],zero,zero,zero,zero,xmm6[4,9,14,u,u,u]
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24))
+; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm7
+; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u]
; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15],zero,zero,zero,xmm3[u,u,u]
-; AVX512-NEXT: vpor %xmm6, %xmm3, %xmm3
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm19) | ymm2
-; AVX512-NEXT: vpshufb %xmm0, %xmm13, %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
+; AVX512-NEXT: vpor %xmm7, %xmm3, %xmm3
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm20) | ymm2
+; AVX512-NEXT: vpshufb %xmm0, %xmm12, %xmm0
+; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm20 & (zmm0 ^ zmm3))
-; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm19
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm3))
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20
; AVX512-NEXT: vmovdqa %ymm10, %ymm0
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm15 ^ (ymm0 & (ymm12 ^ ymm15))
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm14 ^ (ymm0 & (ymm11 ^ ymm14))
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14]
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,2,7,12],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa %ymm5, %ymm1
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm7 ^ (ymm1 & (ymm25 ^ ymm7))
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (mem & (ymm1 ^ ymm8))
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8))
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9))
; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm9[2,7,12,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[3,8,13],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[3,8,13],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0))
; AVX512-NEXT: vmovdqa %ymm5, %ymm0
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm22 ^ ymm21))
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22))
; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0))
; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
; AVX512-NEXT: vpshufb %ymm0, %ymm2, %ymm2
; AVX512-NEXT: vmovdqa %ymm4, %ymm3
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm23 ^ (ymm3 & (ymm24 ^ ymm23))
-; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11],zero,zero,zero,zero,xmm3[u,u,u]
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24))
+; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11],zero,zero,zero,zero,xmm3[u,u,u]
; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm3
; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15,u,u,u]
-; AVX512-NEXT: vpor %xmm6, %xmm3, %xmm3
+; AVX512-NEXT: vpor %xmm7, %xmm3, %xmm3
; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & mem) | ymm2
-; AVX512-NEXT: vpshufb %xmm0, %xmm13, %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
+; AVX512-NEXT: vpshufb %xmm0, %xmm12, %xmm0
+; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512-NEXT: vpmovsxwq {{.*#+}} zmm2 = [0,0,0,18446744073709551360,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm3 ^ (zmm2 & (zmm0 ^ zmm3))
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm15 ^ (ymm4 & (ymm12 ^ ymm15))
-; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[u,u,u,3,8,13],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero
-; AVX512-NEXT: vextracti128 $1, %ymm12, %xmm3
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm14 ^ (ymm4 & (ymm11 ^ ymm14))
+; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,3,8,13],zero,zero,zero,xmm11[1,6,11],zero,zero,zero,zero
+; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm3
; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15]
; AVX512-NEXT: vpor %xmm1, %xmm3, %xmm1
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = ymm25 ^ (ymm10 & (ymm7 ^ ymm25))
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = ymm8 ^ (mem & (ymm10 ^ ymm8))
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = ymm26 ^ (ymm10 & (ymm8 ^ ymm26))
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = ymm9 ^ (mem & (ymm10 ^ ymm9))
; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm9[3,8,13,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[4,9,14],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,8,13,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[4,9,14],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpor %xmm6, %xmm7, %xmm6
; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm3[3,4,5,6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7]
; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (ymm16 & (ymm3 ^ ymm1))
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm23 ^ (ymm5 & (ymm24 ^ ymm23))
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm24 ^ (ymm5 & (ymm25 ^ ymm24))
; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm1
; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11,u,u,u,u]
; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,9,14],zero,zero,zero,xmm5[2,7,12],zero,zero,zero,xmm5[u,u,u,u]
; AVX512-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ymm22 ^ (ymm4 & (ymm21 ^ ymm22))
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23))
; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm4))
; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u]
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7]
; AVX512-NEXT: vmovdqa 128(%rdi), %ymm4
; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,5,0,5,0,5,0,5]
-; AVX512-NEXT: vpermd %ymm4, %ymm5, %ymm4
+; AVX512-NEXT: vpermd %ymm4, %ymm17, %ymm4
; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm1 ^ (zmm2 & (zmm4 ^ zmm1))
; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm1
-; AVX512-NEXT: vmovdqa64 %zmm18, (%rsi)
-; AVX512-NEXT: vmovdqa64 %zmm17, (%rdx)
-; AVX512-NEXT: vmovdqa64 %zmm19, (%rcx)
+; AVX512-NEXT: vmovdqa64 %zmm19, (%rsi)
+; AVX512-NEXT: vmovdqa64 %zmm18, (%rdx)
+; AVX512-NEXT: vmovdqa64 %zmm20, (%rcx)
; AVX512-NEXT: vmovdqa64 %zmm0, (%r8)
; AVX512-NEXT: vmovdqa64 %zmm1, (%r9)
; AVX512-NEXT: vzeroupper
@@ -6600,203 +6600,203 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FCP-LABEL: load_i8_stride5_vf64:
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535]
-; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm23
-; AVX512-FCP-NEXT: vmovdqa64 32(%rdi), %ymm24
-; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm21
-; AVX512-FCP-NEXT: vmovdqa64 96(%rdi), %ymm22
+; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm24
+; AVX512-FCP-NEXT: vmovdqa64 32(%rdi), %ymm25
+; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm22
+; AVX512-FCP-NEXT: vmovdqa64 96(%rdi), %ymm23
; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm4
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm22 ^ (ymm4 & (ymm21 ^ ymm22))
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ ymm4))
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128]
-; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm6, %ymm6
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (mem & (ymm7 ^ ymm4))
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128]
+; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm7, %ymm7
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535]
-; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm7
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm24 ^ (ymm7 & (ymm23 ^ ymm24))
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,xmm8[4,9,14],zero,zero,zero,xmm8[2,7,12,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,5,10,15],zero,zero,zero,xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm7, %xmm8, %xmm12
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = (ymm12 & ymm19) | ymm6
-; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm25
-; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm7
-; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm9
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm25 ^ (ymm9 & (ymm7 ^ ymm25))
-; AVX512-FCP-NEXT: vmovdqa 208(%rdi), %xmm8
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm8 ^ (mem & (ymm9 ^ ymm8))
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,ymm9[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vmovdqa 176(%rdi), %xmm9
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u]
-; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %xmm11
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u]
-; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm15[2],xmm14[2],xmm15[3],xmm14[3]
+; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm8
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm25 ^ (ymm8 & (ymm24 ^ ymm25))
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,5,10,15],zero,zero,zero,xmm8[3,8,13],zero,zero,zero,xmm8[u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm9, %xmm8, %xmm10
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm20) | ymm7
+; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm26
+; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm8
+; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm11
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm26 ^ (ymm11 & (ymm8 ^ ymm26))
+; AVX512-FCP-NEXT: vmovdqa 208(%rdi), %xmm9
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm9 ^ (mem & (ymm11 ^ ymm9))
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,ymm11[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm12
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm17 = [0,5,0,5,0,5,0,5]
+; AVX512-FCP-NEXT: vpermd %ymm12, %ymm17, %ymm15
; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = (ymm0 & ~ymm16) | ymm13
-; AVX512-FCP-NEXT: vmovdqa 144(%rdi), %xmm13
-; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm13, %xmm10
-; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %xmm14
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
-; AVX512-FCP-NEXT: vpor %xmm10, %xmm15, %xmm10
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm10
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm20 & (zmm10 ^ zmm12))
-; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm15
-; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %ymm12
-; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm2
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm15 ^ (ymm2 & (ymm12 ^ ymm15))
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero
-; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm18
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = (ymm15 & ~ymm16) | ymm11
+; AVX512-FCP-NEXT: vmovdqa 144(%rdi), %xmm12
+; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm12, %xmm6
+; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %xmm13
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
+; AVX512-FCP-NEXT: vpor %xmm6, %xmm11, %xmm6
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm6
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm21 & (zmm6 ^ zmm10))
+; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm14
+; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %ymm11
+; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm10
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm14 ^ (ymm10 & (ymm11 ^ ymm14))
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm0
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u],zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[1,6,11]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,4,9,14],zero,zero,zero,xmm10[2,7,12],zero,zero,zero
+; AVX512-FCP-NEXT: vpor %xmm0, %xmm10, %xmm0
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm19
; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm0
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm15 ^ ymm12))
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11))
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm6
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero
-; AVX512-FCP-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX512-FCP-NEXT: vpor %xmm6, %xmm0, %xmm0
; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm2
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm7 ^ ymm25))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm8 ^ (mem & (ymm2 ^ ymm8))
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,ymm2[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[1,6,11],zero,zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm9[0,5,10,15,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm3, %xmm10, %xmm3
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ~ymm16) | ymm2
+; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm6
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm26 ^ (ymm6 & (ymm8 ^ ymm26))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm9 ^ (mem & (ymm6 ^ ymm9))
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,ymm6[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %xmm15
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm15[1,6,11],zero,zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vmovdqa 176(%rdi), %xmm6
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm6[0,5,10,15,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ~ymm16) | ymm10
; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm0 ^ (ymm16 & (ymm3 ^ ymm0))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0))
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535]
; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm0
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm22 ^ ymm21))
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm0))
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,2,7,12,17,22,27,16,21,26,31,20,25,30,128,128,128,128,128,128]
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm2
-; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm1
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm24 ^ (ymm1 & (ymm23 ^ ymm24))
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14],zero,zero,zero,xmm1[u,u,u]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[0,5,10,15],zero,zero,zero,xmm1[3,8,13,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm6, %xmm1, %xmm1
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ymm19) | ymm2
-; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm13, %xmm0
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
-; AVX512-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm3
+; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm2
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm24 ^ ymm25))
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[1,6,11],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[u,u,u]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm7, %xmm2, %xmm2
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm20) | ymm3
+; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm0
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
+; AVX512-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0
; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm20 & (zmm0 ^ zmm1))
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm17
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm2))
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm18
; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm0
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm15 ^ ymm12))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11))
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,1,6,11],zero,zero,zero,zero,xmm0[4,9,14],zero,zero,zero
; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13]
; AVX512-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm1
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm7 ^ (ymm1 & (ymm25 ^ ymm7))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (mem & (ymm1 ^ ymm8))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9))
; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm9[1,6,11,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[2,7,12],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[2,7,12],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0))
; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm0
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm22 ^ ymm21))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22))
; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0))
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128]
; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm2
; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm3
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm23 ^ (ymm3 & (ymm24 ^ ymm23))
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm6
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[1,6,11],zero,zero,zero,zero,xmm6[4,9,14,u,u,u]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24))
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15],zero,zero,zero,xmm3[u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm6, %xmm3, %xmm3
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm19) | ymm2
-; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm13, %xmm0
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
+; AVX512-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm20) | ymm2
+; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm0
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
; AVX512-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm20 & (zmm0 ^ zmm3))
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm19
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm3))
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20
; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm0
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm15 ^ (ymm0 & (ymm12 ^ ymm15))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm14 ^ (ymm0 & (ymm11 ^ ymm14))
; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,2,7,12],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero
; AVX512-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm1
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm7 ^ (ymm1 & (ymm25 ^ ymm7))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (mem & (ymm1 ^ ymm8))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9))
; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm9[2,7,12,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[3,8,13],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[3,8,13],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0))
; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm0
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm22 ^ ymm21))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22))
; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0))
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm2
; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm3
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm23 ^ (ymm3 & (ymm24 ^ ymm23))
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11],zero,zero,zero,zero,xmm3[u,u,u]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24))
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11],zero,zero,zero,zero,xmm3[u,u,u]
; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm6, %xmm3, %xmm3
+; AVX512-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3
; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & mem) | ymm2
-; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm13, %xmm0
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
+; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm0
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
; AVX512-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512-FCP-NEXT: vpmovsxwq {{.*#+}} zmm2 = [0,0,0,18446744073709551360,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm3 ^ (zmm2 & (zmm0 ^ zmm3))
; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm15 ^ (ymm4 & (ymm12 ^ ymm15))
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[u,u,u,3,8,13],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm3
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm14 ^ (ymm4 & (ymm11 ^ ymm14))
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,3,8,13],zero,zero,zero,xmm11[1,6,11],zero,zero,zero,zero
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm3
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15]
; AVX512-FCP-NEXT: vpor %xmm1, %xmm3, %xmm1
; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm25 ^ (ymm10 & (ymm7 ^ ymm25))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm8 ^ (mem & (ymm10 ^ ymm8))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm26 ^ (ymm10 & (ymm8 ^ ymm26))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm9 ^ (mem & (ymm10 ^ ymm9))
; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm9[3,8,13,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[4,9,14],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,8,13,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[4,9,14],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm3[3,4,5,6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7]
; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (ymm16 & (ymm3 ^ ymm1))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm23 ^ (ymm5 & (ymm24 ^ ymm23))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm24 ^ (ymm5 & (ymm25 ^ ymm24))
; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm1
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11,u,u,u,u]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,9,14],zero,zero,zero,xmm5[2,7,12],zero,zero,zero,xmm5[u,u,u,u]
; AVX512-FCP-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm22 ^ (ymm4 & (ymm21 ^ ymm22))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23))
; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm4))
; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm4
; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,5,0,5,0,5,0,5]
-; AVX512-FCP-NEXT: vpermd %ymm4, %ymm5, %ymm4
+; AVX512-FCP-NEXT: vpermd %ymm4, %ymm17, %ymm4
; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm1 ^ (zmm2 & (zmm4 ^ zmm1))
; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm1
-; AVX512-FCP-NEXT: vmovdqa64 %zmm18, (%rsi)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm17, (%rdx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm19, (%rcx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm19, (%rsi)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm18, (%rdx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm20, (%rcx)
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%r8)
; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%r9)
; AVX512-FCP-NEXT: vzeroupper
@@ -6805,203 +6805,203 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-LABEL: load_i8_stride5_vf64:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535]
-; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm23
-; AVX512DQ-NEXT: vmovdqa64 32(%rdi), %ymm24
-; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %ymm21
-; AVX512DQ-NEXT: vmovdqa64 96(%rdi), %ymm22
+; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm24
+; AVX512DQ-NEXT: vmovdqa64 32(%rdi), %ymm25
+; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %ymm22
+; AVX512DQ-NEXT: vmovdqa64 96(%rdi), %ymm23
; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm4
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ymm22 ^ (ymm4 & (ymm21 ^ ymm22))
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ ymm4))
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128]
-; AVX512DQ-NEXT: vpshufb %ymm10, %ymm6, %ymm6
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23))
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (mem & (ymm7 ^ ymm4))
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128]
+; AVX512DQ-NEXT: vpshufb %ymm6, %ymm7, %ymm7
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535]
-; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm7
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = ymm24 ^ (ymm7 & (ymm23 ^ ymm24))
-; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm8
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,xmm8[4,9,14],zero,zero,zero,xmm8[2,7,12,u,u,u]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,5,10,15],zero,zero,zero,xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u]
-; AVX512DQ-NEXT: vpor %xmm7, %xmm8, %xmm12
-; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm19 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = (ymm12 & ymm19) | ymm6
-; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %ymm25
-; AVX512DQ-NEXT: vmovdqa 224(%rdi), %ymm7
-; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm9
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm25 ^ (ymm9 & (ymm7 ^ ymm25))
-; AVX512DQ-NEXT: vmovdqa 208(%rdi), %xmm8
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm8 ^ (mem & (ymm9 ^ ymm8))
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,ymm9[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vmovdqa 176(%rdi), %xmm9
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u]
-; AVX512DQ-NEXT: vmovdqa 160(%rdi), %xmm11
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u]
-; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm15[2],xmm14[2],xmm15[3],xmm14[3]
+; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm8
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm25 ^ (ymm8 & (ymm24 ^ ymm25))
+; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm9
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12,u,u,u]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,5,10,15],zero,zero,zero,xmm8[3,8,13],zero,zero,zero,xmm8[u,u,u]
+; AVX512DQ-NEXT: vpor %xmm9, %xmm8, %xmm10
+; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm20 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm20) | ymm7
+; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %ymm26
+; AVX512DQ-NEXT: vmovdqa 224(%rdi), %ymm8
+; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm11
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ymm26 ^ (ymm11 & (ymm8 ^ ymm26))
+; AVX512DQ-NEXT: vmovdqa 208(%rdi), %xmm9
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ymm9 ^ (mem & (ymm11 ^ ymm9))
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,ymm11[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm12
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm17 = [0,5,0,5,0,5,0,5]
+; AVX512DQ-NEXT: vpermd %ymm12, %ymm17, %ymm15
; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = (ymm0 & ~ymm16) | ymm13
-; AVX512DQ-NEXT: vmovdqa 144(%rdi), %xmm13
-; AVX512DQ-NEXT: vpshufb %xmm10, %xmm13, %xmm10
-; AVX512DQ-NEXT: vmovdqa 128(%rdi), %xmm14
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
-; AVX512DQ-NEXT: vpor %xmm10, %xmm15, %xmm10
-; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm10
-; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm20 & (zmm10 ^ zmm12))
-; AVX512DQ-NEXT: vmovdqa 256(%rdi), %ymm15
-; AVX512DQ-NEXT: vmovdqa 288(%rdi), %ymm12
-; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm2
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm15 ^ (ymm2 & (ymm12 ^ ymm15))
-; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero
-; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm18
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm15 = (ymm15 & ~ymm16) | ymm11
+; AVX512DQ-NEXT: vmovdqa 144(%rdi), %xmm12
+; AVX512DQ-NEXT: vpshufb %xmm6, %xmm12, %xmm6
+; AVX512DQ-NEXT: vmovdqa 128(%rdi), %xmm13
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
+; AVX512DQ-NEXT: vpor %xmm6, %xmm11, %xmm6
+; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm6
+; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm21 & (zmm6 ^ zmm10))
+; AVX512DQ-NEXT: vmovdqa 256(%rdi), %ymm14
+; AVX512DQ-NEXT: vmovdqa 288(%rdi), %ymm11
+; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm10
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm14 ^ (ymm10 & (ymm11 ^ ymm14))
+; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm0
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u],zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[1,6,11]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,4,9,14],zero,zero,zero,xmm10[2,7,12],zero,zero,zero
+; AVX512DQ-NEXT: vpor %xmm0, %xmm10, %xmm0
+; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5,6,7]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm19
; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm0
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm15 ^ ymm12))
-; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11))
+; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm6
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero
-; AVX512DQ-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX512DQ-NEXT: vpor %xmm6, %xmm0, %xmm0
; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm2
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm7 ^ ymm25))
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm8 ^ (mem & (ymm2 ^ ymm8))
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,ymm2[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[1,6,11],zero,zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm9[0,5,10,15,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpor %xmm3, %xmm10, %xmm3
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ~ymm16) | ymm2
+; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm6
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm26 ^ (ymm6 & (ymm8 ^ ymm26))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm9 ^ (mem & (ymm6 ^ ymm9))
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,ymm6[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vmovdqa 160(%rdi), %xmm15
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm15[1,6,11],zero,zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vmovdqa 176(%rdi), %xmm6
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm6[0,5,10,15,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ~ymm16) | ymm10
; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm0 ^ (ymm16 & (ymm3 ^ ymm0))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0))
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535]
; AVX512DQ-NEXT: vmovdqa %ymm10, %ymm0
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm22 ^ ymm21))
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22))
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm0))
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,2,7,12,17,22,27,16,21,26,31,20,25,30,128,128,128,128,128,128]
-; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm2
-; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm1
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm24 ^ (ymm1 & (ymm23 ^ ymm24))
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14],zero,zero,zero,xmm1[u,u,u]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm1
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[0,5,10,15],zero,zero,zero,xmm1[3,8,13,u,u,u]
-; AVX512DQ-NEXT: vpor %xmm6, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ymm19) | ymm2
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm13, %xmm0
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
-; AVX512DQ-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX512DQ-NEXT: vpshufb %ymm0, %ymm3, %ymm3
+; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm2
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm24 ^ ymm25))
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[1,6,11],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[u,u,u]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm2
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13,u,u,u]
+; AVX512DQ-NEXT: vpor %xmm7, %xmm2, %xmm2
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm20) | ymm3
+; AVX512DQ-NEXT: vpshufb %xmm0, %xmm12, %xmm0
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
+; AVX512DQ-NEXT: vpor %xmm0, %xmm3, %xmm0
; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm20 & (zmm0 ^ zmm1))
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm17
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm2))
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm18
; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm0
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm15 ^ ymm12))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11))
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,1,6,11],zero,zero,zero,zero,xmm0[4,9,14],zero,zero,zero
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13]
; AVX512DQ-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm1
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm7 ^ (ymm1 & (ymm25 ^ ymm7))
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (mem & (ymm1 ^ ymm8))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9))
; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm9[1,6,11,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[2,7,12],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[2,7,12],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0))
; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm0
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm22 ^ ymm21))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22))
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0))
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128]
; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm2
; AVX512DQ-NEXT: vmovdqa %ymm10, %ymm3
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm23 ^ (ymm3 & (ymm24 ^ ymm23))
-; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm6
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[1,6,11],zero,zero,zero,zero,xmm6[4,9,14,u,u,u]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24))
+; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm7
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15],zero,zero,zero,xmm3[u,u,u]
-; AVX512DQ-NEXT: vpor %xmm6, %xmm3, %xmm3
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm19) | ymm2
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm13, %xmm0
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
+; AVX512DQ-NEXT: vpor %xmm7, %xmm3, %xmm3
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm20) | ymm2
+; AVX512DQ-NEXT: vpshufb %xmm0, %xmm12, %xmm0
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
; AVX512DQ-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm20 & (zmm0 ^ zmm3))
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm19
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm3))
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20
; AVX512DQ-NEXT: vmovdqa %ymm10, %ymm0
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm15 ^ (ymm0 & (ymm12 ^ ymm15))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm14 ^ (ymm0 & (ymm11 ^ ymm14))
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,2,7,12],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero
; AVX512DQ-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm1
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm7 ^ (ymm1 & (ymm25 ^ ymm7))
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (mem & (ymm1 ^ ymm8))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9))
; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm9[2,7,12,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[3,8,13],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[3,8,13],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0))
; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm0
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm22 ^ ymm21))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22))
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0))
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm2
; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm3
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm23 ^ (ymm3 & (ymm24 ^ ymm23))
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11],zero,zero,zero,zero,xmm3[u,u,u]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24))
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11],zero,zero,zero,zero,xmm3[u,u,u]
; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm3
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15,u,u,u]
-; AVX512DQ-NEXT: vpor %xmm6, %xmm3, %xmm3
+; AVX512DQ-NEXT: vpor %xmm7, %xmm3, %xmm3
; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & mem) | ymm2
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm13, %xmm0
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
+; AVX512DQ-NEXT: vpshufb %xmm0, %xmm12, %xmm0
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
; AVX512DQ-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwq {{.*#+}} zmm2 = [0,0,0,18446744073709551360,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm3 ^ (zmm2 & (zmm0 ^ zmm3))
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm15 ^ (ymm4 & (ymm12 ^ ymm15))
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[u,u,u,3,8,13],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero
-; AVX512DQ-NEXT: vextracti128 $1, %ymm12, %xmm3
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ymm14 ^ (ymm4 & (ymm11 ^ ymm14))
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,3,8,13],zero,zero,zero,xmm11[1,6,11],zero,zero,zero,zero
+; AVX512DQ-NEXT: vextracti128 $1, %ymm11, %xmm3
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15]
; AVX512DQ-NEXT: vpor %xmm1, %xmm3, %xmm1
; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm25 ^ (ymm10 & (ymm7 ^ ymm25))
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm8 ^ (mem & (ymm10 ^ ymm8))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm26 ^ (ymm10 & (ymm8 ^ ymm26))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm9 ^ (mem & (ymm10 ^ ymm9))
; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm9[3,8,13,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[4,9,14],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,8,13,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[4,9,14],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpor %xmm6, %xmm7, %xmm6
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm3[3,4,5,6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7]
; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (ymm16 & (ymm3 ^ ymm1))
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm23 ^ (ymm5 & (ymm24 ^ ymm23))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm24 ^ (ymm5 & (ymm25 ^ ymm24))
; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm1
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11,u,u,u,u]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,9,14],zero,zero,zero,xmm5[2,7,12],zero,zero,zero,xmm5[u,u,u,u]
; AVX512DQ-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ymm22 ^ (ymm4 & (ymm21 ^ ymm22))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23))
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm4))
; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7]
; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm4
; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,5,0,5,0,5,0,5]
-; AVX512DQ-NEXT: vpermd %ymm4, %ymm5, %ymm4
+; AVX512DQ-NEXT: vpermd %ymm4, %ymm17, %ymm4
; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm1 ^ (zmm2 & (zmm4 ^ zmm1))
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm1
-; AVX512DQ-NEXT: vmovdqa64 %zmm18, (%rsi)
-; AVX512DQ-NEXT: vmovdqa64 %zmm17, (%rdx)
-; AVX512DQ-NEXT: vmovdqa64 %zmm19, (%rcx)
+; AVX512DQ-NEXT: vmovdqa64 %zmm19, (%rsi)
+; AVX512DQ-NEXT: vmovdqa64 %zmm18, (%rdx)
+; AVX512DQ-NEXT: vmovdqa64 %zmm20, (%rcx)
; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%r8)
; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%r9)
; AVX512DQ-NEXT: vzeroupper
@@ -7010,203 +7010,203 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FCP-LABEL: load_i8_stride5_vf64:
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535]
-; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm23
-; AVX512DQ-FCP-NEXT: vmovdqa64 32(%rdi), %ymm24
-; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm21
-; AVX512DQ-FCP-NEXT: vmovdqa64 96(%rdi), %ymm22
+; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm24
+; AVX512DQ-FCP-NEXT: vmovdqa64 32(%rdi), %ymm25
+; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm22
+; AVX512DQ-FCP-NEXT: vmovdqa64 96(%rdi), %ymm23
; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm4
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm22 ^ (ymm4 & (ymm21 ^ ymm22))
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ ymm4))
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm6, %ymm6
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23))
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (mem & (ymm7 ^ ymm4))
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm7, %ymm7
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535]
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm7
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm24 ^ (ymm7 & (ymm23 ^ ymm24))
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,xmm8[4,9,14],zero,zero,zero,xmm8[2,7,12,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,5,10,15],zero,zero,zero,xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm12
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = (ymm12 & ymm19) | ymm6
-; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm25
-; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm7
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm9
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm25 ^ (ymm9 & (ymm7 ^ ymm25))
-; AVX512DQ-FCP-NEXT: vmovdqa 208(%rdi), %xmm8
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm8 ^ (mem & (ymm9 ^ ymm8))
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,ymm9[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vmovdqa 176(%rdi), %xmm9
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %xmm11
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm15[2],xmm14[2],xmm15[3],xmm14[3]
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm8
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm25 ^ (ymm8 & (ymm24 ^ ymm25))
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,5,10,15],zero,zero,zero,xmm8[3,8,13],zero,zero,zero,xmm8[u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm9, %xmm8, %xmm10
+; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm20) | ymm7
+; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm26
+; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm8
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm11
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm26 ^ (ymm11 & (ymm8 ^ ymm26))
+; AVX512DQ-FCP-NEXT: vmovdqa 208(%rdi), %xmm9
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm9 ^ (mem & (ymm11 ^ ymm9))
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,ymm11[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm12
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm17 = [0,5,0,5,0,5,0,5]
+; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm17, %ymm15
; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = (ymm0 & ~ymm16) | ymm13
-; AVX512DQ-FCP-NEXT: vmovdqa 144(%rdi), %xmm13
-; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm13, %xmm10
-; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %xmm14
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpor %xmm10, %xmm15, %xmm10
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm10
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm20 & (zmm10 ^ zmm12))
-; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm15
-; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %ymm12
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm2
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm15 ^ (ymm2 & (ymm12 ^ ymm15))
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm18
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = (ymm15 & ~ymm16) | ymm11
+; AVX512DQ-FCP-NEXT: vmovdqa 144(%rdi), %xmm12
+; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm12, %xmm6
+; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %xmm13
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm11, %xmm6
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm6
+; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm21 & (zmm6 ^ zmm10))
+; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm14
+; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %ymm11
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm10
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm14 ^ (ymm10 & (ymm11 ^ ymm14))
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm0
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u],zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[1,6,11]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,4,9,14],zero,zero,zero,xmm10[2,7,12],zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm10, %xmm0
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm19
; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm0
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm15 ^ ymm12))
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11))
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm6
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm0, %xmm0
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm2
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm7 ^ ymm25))
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm8 ^ (mem & (ymm2 ^ ymm8))
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,ymm2[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[1,6,11],zero,zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm9[0,5,10,15,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm10, %xmm3
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ~ymm16) | ymm2
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm6
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm26 ^ (ymm6 & (ymm8 ^ ymm26))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm9 ^ (mem & (ymm6 ^ ymm9))
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,ymm6[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %xmm15
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm15[1,6,11],zero,zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vmovdqa 176(%rdi), %xmm6
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm6[0,5,10,15,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ~ymm16) | ymm10
; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm0 ^ (ymm16 & (ymm3 ^ ymm0))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0))
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535]
; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm0
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm22 ^ ymm21))
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22))
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm0))
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,2,7,12,17,22,27,16,21,26,31,20,25,30,128,128,128,128,128,128]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm1
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm24 ^ (ymm1 & (ymm23 ^ ymm24))
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14],zero,zero,zero,xmm1[u,u,u]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[0,5,10,15],zero,zero,zero,xmm1[3,8,13,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ymm19) | ymm2
-; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm13, %xmm0
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm3
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm2
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm24 ^ ymm25))
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[1,6,11],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[u,u,u]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm20) | ymm3
+; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm0
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm20 & (zmm0 ^ zmm1))
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm17
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm2))
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm18
; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm0
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm15 ^ ymm12))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11))
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,1,6,11],zero,zero,zero,zero,xmm0[4,9,14],zero,zero,zero
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13]
; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm1
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm7 ^ (ymm1 & (ymm25 ^ ymm7))
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (mem & (ymm1 ^ ymm8))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9))
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm9[1,6,11,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[2,7,12],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[2,7,12],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0))
; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm0
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm22 ^ ymm21))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22))
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0))
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128]
; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm2
; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm3
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm23 ^ (ymm3 & (ymm24 ^ ymm23))
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm6
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[1,6,11],zero,zero,zero,zero,xmm6[4,9,14,u,u,u]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24))
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15],zero,zero,zero,xmm3[u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm19) | ymm2
-; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm13, %xmm0
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm20) | ymm2
+; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm0
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm20 & (zmm0 ^ zmm3))
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm19
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm3))
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20
; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm0
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm15 ^ (ymm0 & (ymm12 ^ ymm15))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm14 ^ (ymm0 & (ymm11 ^ ymm14))
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,2,7,12],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero
; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm1
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm7 ^ (ymm1 & (ymm25 ^ ymm7))
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (mem & (ymm1 ^ ymm8))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9))
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm9[2,7,12,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[3,8,13],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[3,8,13],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0))
; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm0
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm22 ^ ymm21))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22))
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0))
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm2
; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm3
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm23 ^ (ymm3 & (ymm24 ^ ymm23))
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11],zero,zero,zero,zero,xmm3[u,u,u]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24))
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11],zero,zero,zero,zero,xmm3[u,u,u]
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm3, %xmm3
+; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3
; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & mem) | ymm2
-; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm13, %xmm0
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm0
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512DQ-FCP-NEXT: vpmovsxwq {{.*#+}} zmm2 = [0,0,0,18446744073709551360,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm3 ^ (zmm2 & (zmm0 ^ zmm3))
; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm15 ^ (ymm4 & (ymm12 ^ ymm15))
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[u,u,u,3,8,13],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm3
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm14 ^ (ymm4 & (ymm11 ^ ymm14))
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,3,8,13],zero,zero,zero,xmm11[1,6,11],zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm3
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15]
; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm3, %xmm1
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm25 ^ (ymm10 & (ymm7 ^ ymm25))
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm8 ^ (mem & (ymm10 ^ ymm8))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm26 ^ (ymm10 & (ymm8 ^ ymm26))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm9 ^ (mem & (ymm10 ^ ymm9))
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm9[3,8,13,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[4,9,14],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,8,13,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[4,9,14],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm3[3,4,5,6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7]
; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (ymm16 & (ymm3 ^ ymm1))
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm23 ^ (ymm5 & (ymm24 ^ ymm23))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm24 ^ (ymm5 & (ymm25 ^ ymm24))
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm1
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,9,14],zero,zero,zero,xmm5[2,7,12],zero,zero,zero,xmm5[u,u,u,u]
; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm22 ^ (ymm4 & (ymm21 ^ ymm22))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23))
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm4))
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm4
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,5,0,5,0,5,0,5]
-; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm5, %ymm4
+; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm17, %ymm4
; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm1 ^ (zmm2 & (zmm4 ^ zmm1))
; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, (%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, (%rsi)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, (%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, (%rcx)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%r8)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%r9)
; AVX512DQ-FCP-NEXT: vzeroupper
@@ -7231,163 +7231,164 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm6
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u]
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u]
-; AVX512BW-NEXT: vpor %xmm6, %xmm5, %xmm10
+; AVX512BW-NEXT: vpor %xmm6, %xmm5, %xmm9
; AVX512BW-NEXT: movl $67100672, %eax # imm = 0x3FFE000
; AVX512BW-NEXT: kmovd %eax, %k5
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm10 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u]
-; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm5
-; AVX512BW-NEXT: vmovdqa 224(%rdi), %ymm4
-; AVX512BW-NEXT: vpblendmw %ymm5, %ymm4, %ymm6 {%k1}
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm9 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u]
+; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm6
+; AVX512BW-NEXT: vmovdqa 224(%rdi), %ymm5
+; AVX512BW-NEXT: vpblendmw %ymm6, %ymm5, %ymm4 {%k1}
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1]
; AVX512BW-NEXT: movl $4228, %eax # imm = 0x1084
; AVX512BW-NEXT: kmovd %eax, %k3
-; AVX512BW-NEXT: vmovdqu8 %ymm7, %ymm6 {%k3}
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vmovdqa 176(%rdi), %xmm6
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u]
-; AVX512BW-NEXT: vmovdqa 160(%rdi), %xmm7
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u]
-; AVX512BW-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm11[2],xmm9[2],xmm11[3],xmm9[3]
+; AVX512BW-NEXT: vmovdqu8 %ymm7, %ymm4 {%k3}
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[u,u,u,u,u,u,u,3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vmovdqa 160(%rdi), %ymm4
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm19 = [0,5,0,5,0,5,0,5]
+; AVX512BW-NEXT: vpermd %ymm8, %ymm19, %ymm8
; AVX512BW-NEXT: movl $127, %eax
; AVX512BW-NEXT: kmovd %eax, %k4
-; AVX512BW-NEXT: vmovdqu8 %ymm9, %ymm8 {%k4}
-; AVX512BW-NEXT: vmovdqa 144(%rdi), %xmm12
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm12[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[1,6,11]
-; AVX512BW-NEXT: vmovdqa 128(%rdi), %xmm13
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
-; AVX512BW-NEXT: vpor %xmm9, %xmm11, %xmm9
-; AVX512BW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8
-; AVX512BW-NEXT: vmovdqu16 %zmm8, %zmm10 {%k5}
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm10, %ymm11
-; AVX512BW-NEXT: vmovdqa 256(%rdi), %ymm9
-; AVX512BW-NEXT: vmovdqa 288(%rdi), %ymm8
-; AVX512BW-NEXT: vpblendmw %ymm9, %ymm8, %ymm14 {%k2}
-; AVX512BW-NEXT: vextracti128 $1, %ymm14, %xmm15
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u],zero,zero,zero,xmm15[3,8,13],zero,zero,zero,xmm15[1,6,11]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,4,9,14],zero,zero,zero,xmm14[2,7,12],zero,zero,zero
-; AVX512BW-NEXT: vpor %xmm15, %xmm14, %xmm14
-; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm18
+; AVX512BW-NEXT: vmovdqu8 %ymm8, %ymm7 {%k4}
+; AVX512BW-NEXT: vmovdqa 144(%rdi), %xmm11
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[1,6,11]
+; AVX512BW-NEXT: vmovdqa 128(%rdi), %xmm12
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
+; AVX512BW-NEXT: vpor %xmm8, %xmm10, %xmm8
+; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7
+; AVX512BW-NEXT: vmovdqu16 %zmm7, %zmm9 {%k5}
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm9, %ymm10
+; AVX512BW-NEXT: vmovdqa 256(%rdi), %ymm8
+; AVX512BW-NEXT: vmovdqa 288(%rdi), %ymm7
+; AVX512BW-NEXT: vpblendmw %ymm8, %ymm7, %ymm13 {%k2}
+; AVX512BW-NEXT: vextracti128 $1, %ymm13, %xmm14
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u],zero,zero,zero,xmm14[3,8,13],zero,zero,zero,xmm14[1,6,11]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,4,9,14],zero,zero,zero,xmm13[2,7,12],zero,zero,zero
+; AVX512BW-NEXT: vpor %xmm14, %xmm13, %xmm13
+; AVX512BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm13[5,6,7]
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm20
; AVX512BW-NEXT: movw $10570, %ax # imm = 0x294A
; AVX512BW-NEXT: kmovd %eax, %k3
-; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm11 {%k3}
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm14 = ymm11[2,3,0,1]
+; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm10 {%k3}
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1]
; AVX512BW-NEXT: movl $-2078212096, %eax # imm = 0x84210000
; AVX512BW-NEXT: kmovd %eax, %k6
-; AVX512BW-NEXT: vmovdqu8 %ymm14, %ymm11 {%k6}
-; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm14 {%k2}
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[1,6,11],zero,zero,zero,zero,xmm14[4,9,14],zero,zero,zero,xmm14[u,u,u]
-; AVX512BW-NEXT: vextracti128 $1, %ymm14, %xmm14
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm14[0,5,10,15],zero,zero,zero,xmm14[3,8,13,u,u,u]
-; AVX512BW-NEXT: vpor %xmm15, %xmm14, %xmm14
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpblendmw %ymm5, %ymm4, %ymm11 {%k2}
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm15 = ymm11[2,3,0,1]
+; AVX512BW-NEXT: vmovdqu8 %ymm13, %ymm10 {%k6}
+; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm13 {%k2}
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14],zero,zero,zero,xmm13[u,u,u]
+; AVX512BW-NEXT: vextracti128 $1, %ymm13, %xmm13
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[0,5,10,15],zero,zero,zero,xmm13[3,8,13,u,u,u]
+; AVX512BW-NEXT: vpor %xmm14, %xmm13, %xmm14
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpblendmw %ymm6, %ymm5, %ymm10 {%k2}
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1]
; AVX512BW-NEXT: movl $8456, %eax # imm = 0x2108
; AVX512BW-NEXT: kmovd %eax, %k6
-; AVX512BW-NEXT: vmovdqu8 %ymm15, %ymm11 {%k6}
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm6[0,5,10,15,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vporq %xmm15, %xmm16, %xmm15
-; AVX512BW-NEXT: vmovdqu8 %ymm15, %ymm11 {%k4}
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[2,7,12]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
-; AVX512BW-NEXT: vporq %xmm15, %xmm16, %xmm15
-; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm11, %zmm15, %zmm11
-; AVX512BW-NEXT: vmovdqu16 %zmm11, %zmm14 {%k5}
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm14, %ymm11
-; AVX512BW-NEXT: vpblendmw %ymm8, %ymm9, %ymm15 {%k1}
-; AVX512BW-NEXT: vextracti32x4 $1, %ymm15, %xmm16
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u],zero,zero,zero,zero,xmm16[4,9,14],zero,zero,zero,xmm16[2,7,12]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,0,5,10,15],zero,zero,zero,xmm15[3,8,13],zero,zero,zero
-; AVX512BW-NEXT: vporq %xmm16, %xmm15, %xmm15
-; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512BW-NEXT: vmovdqu8 %ymm13, %ymm10 {%k6}
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vmovdqa 160(%rdi), %xmm10
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vmovdqa 176(%rdi), %xmm13
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[0,5,10,15,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vporq %xmm16, %xmm17, %xmm16
+; AVX512BW-NEXT: vmovdqu8 %ymm16, %ymm15 {%k4}
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[2,7,12]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
+; AVX512BW-NEXT: vporq %xmm16, %xmm17, %xmm16
+; AVX512BW-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm15, %zmm16, %zmm15
+; AVX512BW-NEXT: vmovdqu16 %zmm15, %zmm14 {%k5}
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm14, %ymm15
+; AVX512BW-NEXT: vpblendmw %ymm7, %ymm8, %ymm16 {%k1}
+; AVX512BW-NEXT: vextracti32x4 $1, %ymm16, %xmm17
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u],zero,zero,zero,zero,xmm17[4,9,14],zero,zero,zero,xmm17[2,7,12]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,0,5,10,15],zero,zero,zero,xmm16[3,8,13],zero,zero,zero
+; AVX512BW-NEXT: vporq %xmm17, %xmm16, %xmm16
+; AVX512BW-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16
; AVX512BW-NEXT: movl $-524288, %eax # imm = 0xFFF80000
; AVX512BW-NEXT: kmovd %eax, %k4
-; AVX512BW-NEXT: vmovdqu8 %ymm15, %ymm11 {%k4}
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm11, %zmm14, %zmm19
-; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm14 {%k1}
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1]
+; AVX512BW-NEXT: vmovdqu8 %ymm16, %ymm15 {%k4}
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm14
+; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm15 {%k1}
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm16 = ymm15[2,3,0,1]
; AVX512BW-NEXT: movl $138543104, %eax # imm = 0x8420000
; AVX512BW-NEXT: kmovd %eax, %k6
-; AVX512BW-NEXT: vmovdqu8 %ymm15, %ymm14 {%k6}
-; AVX512BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm15 {%k3}
-; AVX512BW-NEXT: vextracti32x4 $1, %ymm15, %xmm16
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm16[1,6,11],zero,zero,zero,zero,xmm16[4,9,14,u,u,u]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[2,7,12],zero,zero,zero,xmm15[0,5,10,15],zero,zero,zero,xmm15[u,u,u]
-; AVX512BW-NEXT: vporq %xmm16, %xmm15, %xmm15
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpblendmw %ymm4, %ymm5, %ymm14 {%k1}
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm16 = ymm14[2,3,0,1]
+; AVX512BW-NEXT: vmovdqu8 %ymm16, %ymm15 {%k6}
+; AVX512BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm16 {%k3}
+; AVX512BW-NEXT: vextracti32x4 $1, %ymm16, %xmm17
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm17[1,6,11],zero,zero,zero,zero,xmm17[4,9,14,u,u,u]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[2,7,12],zero,zero,zero,xmm16[0,5,10,15],zero,zero,zero,xmm16[u,u,u]
+; AVX512BW-NEXT: vporq %xmm17, %xmm16, %xmm16
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm16 {%k5} = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpblendmw %ymm5, %ymm6, %ymm15 {%k1}
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm17 = ymm15[2,3,0,1]
; AVX512BW-NEXT: movl $16912, %eax # imm = 0x4210
; AVX512BW-NEXT: kmovd %eax, %k6
-; AVX512BW-NEXT: vmovdqu8 %ymm16, %ymm14 {%k6}
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm7[2,7,12],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vporq %xmm16, %xmm17, %xmm10
-; AVX512BW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm14[3,4,5,6,7]
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm14[4,5,6,7]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[3,8,13]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
-; AVX512BW-NEXT: vporq %xmm14, %xmm16, %xmm14
-; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm10, %zmm14, %zmm10
-; AVX512BW-NEXT: vmovdqu16 %zmm10, %zmm15 {%k5}
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm15, %ymm10
-; AVX512BW-NEXT: vpblendmw %ymm8, %ymm9, %ymm14 {%k2}
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm14[u,u,u,1,6,11],zero,zero,zero,zero,xmm14[4,9,14],zero,zero,zero
-; AVX512BW-NEXT: vextracti128 $1, %ymm14, %xmm14
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u],zero,zero,zero,xmm14[0,5,10,15],zero,zero,zero,xmm14[3,8,13]
-; AVX512BW-NEXT: vporq %xmm16, %xmm14, %xmm14
-; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512BW-NEXT: vmovdqu8 %ymm14, %ymm10 {%k4}
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm10, %zmm15, %zmm14
-; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm10 {%k2}
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm15 = ymm10[2,3,0,1]
+; AVX512BW-NEXT: vmovdqu8 %ymm17, %ymm15 {%k6}
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[1,6,11,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm18 = xmm10[2,7,12],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vporq %xmm17, %xmm18, %xmm4
+; AVX512BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm15[3,4,5,6,7]
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[3,8,13]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
+; AVX512BW-NEXT: vporq %xmm15, %xmm17, %xmm15
+; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm15, %zmm4
+; AVX512BW-NEXT: vmovdqu16 %zmm4, %zmm16 {%k5}
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm16, %ymm4
+; AVX512BW-NEXT: vpblendmw %ymm7, %ymm8, %ymm15 {%k2}
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm15[u,u,u,1,6,11],zero,zero,zero,zero,xmm15[4,9,14],zero,zero,zero
+; AVX512BW-NEXT: vextracti128 $1, %ymm15, %xmm15
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u],zero,zero,zero,xmm15[0,5,10,15],zero,zero,zero,xmm15[3,8,13]
+; AVX512BW-NEXT: vporq %xmm17, %xmm15, %xmm15
+; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512BW-NEXT: vmovdqu8 %ymm15, %ymm4 {%k4}
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm16, %zmm15
+; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k2}
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm16 = ymm4[2,3,0,1]
; AVX512BW-NEXT: movl $277086208, %eax # imm = 0x10840000
; AVX512BW-NEXT: kmovd %eax, %k5
-; AVX512BW-NEXT: vmovdqu8 %ymm15, %ymm10 {%k5}
-; AVX512BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm15 {%k1}
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[3,8,13],zero,zero,zero,xmm15[1,6,11],zero,zero,zero,zero,xmm15[u,u,u]
-; AVX512BW-NEXT: vextracti128 $1, %ymm15, %xmm15
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[2,7,12],zero,zero,zero,xmm15[0,5,10,15,u,u,u]
-; AVX512BW-NEXT: vporq %xmm16, %xmm15, %xmm15
+; AVX512BW-NEXT: vmovdqu8 %ymm16, %ymm4 {%k5}
+; AVX512BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm16 {%k1}
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm16[3,8,13],zero,zero,zero,xmm16[1,6,11],zero,zero,zero,zero,xmm16[u,u,u]
+; AVX512BW-NEXT: vextracti32x4 $1, %ymm16, %xmm16
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm16[2,7,12],zero,zero,zero,xmm16[0,5,10,15,u,u,u]
+; AVX512BW-NEXT: vporq %xmm17, %xmm16, %xmm16
; AVX512BW-NEXT: movl $33546240, %eax # imm = 0x1FFE000
; AVX512BW-NEXT: kmovd %eax, %k5
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpblendmw %ymm4, %ymm5, %ymm10 {%k2}
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm16 = ymm10[2,3,0,1]
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm16 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpblendmw %ymm5, %ymm6, %ymm4 {%k2}
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm17 = ymm4[2,3,0,1]
; AVX512BW-NEXT: movl $33825, %eax # imm = 0x8421
; AVX512BW-NEXT: kmovd %eax, %k5
-; AVX512BW-NEXT: vmovdqu8 %ymm16, %ymm10 {%k5}
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vporq %xmm16, %xmm17, %xmm11
-; AVX512BW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm10[3,4,5,6,7]
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm12[4,9,14]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
-; AVX512BW-NEXT: vpor %xmm11, %xmm12, %xmm11
-; AVX512BW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10
+; AVX512BW-NEXT: vmovdqu8 %ymm17, %ymm4 {%k5}
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[2,7,12,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm18 = xmm10[3,8,13],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vporq %xmm17, %xmm18, %xmm9
+; AVX512BW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm4[3,4,5,6,7]
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm11[4,9,14]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
+; AVX512BW-NEXT: vpor %xmm9, %xmm11, %xmm9
+; AVX512BW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm9, %zmm4
; AVX512BW-NEXT: movl $33554431, %eax # imm = 0x1FFFFFF
; AVX512BW-NEXT: kmovq %rax, %k5
-; AVX512BW-NEXT: vmovdqu8 %zmm15, %zmm10 {%k5}
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm10, %ymm11
-; AVX512BW-NEXT: vpblendmw %ymm9, %ymm8, %ymm12 {%k3}
-; AVX512BW-NEXT: vextracti128 $1, %ymm12, %xmm13
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u],zero,zero,zero,xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,2,7,12],zero,zero,zero,xmm12[0,5,10,15],zero,zero,zero
-; AVX512BW-NEXT: vpor %xmm13, %xmm12, %xmm12
-; AVX512BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX512BW-NEXT: vmovdqu8 %ymm12, %ymm11 {%k4}
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10
+; AVX512BW-NEXT: vmovdqu8 %zmm16, %zmm4 {%k5}
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm4, %ymm9
+; AVX512BW-NEXT: vpblendmw %ymm8, %ymm7, %ymm11 {%k3}
+; AVX512BW-NEXT: vextracti128 $1, %ymm11, %xmm12
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero,xmm12[4,9,14]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,2,7,12],zero,zero,zero,xmm11[0,5,10,15],zero,zero,zero
+; AVX512BW-NEXT: vpor %xmm12, %xmm11, %xmm11
+; AVX512BW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512BW-NEXT: vmovdqu8 %ymm11, %ymm9 {%k4}
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4
; AVX512BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k2}
; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u]
@@ -7400,36 +7401,35 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2}
; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
-; AVX512BW-NEXT: vmovdqu16 %ymm5, %ymm4 {%k3}
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm4[2,3,0,1]
+; AVX512BW-NEXT: vmovdqu16 %ymm6, %ymm5 {%k3}
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm5[2,3,0,1]
; AVX512BW-NEXT: movl $2114, %eax # imm = 0x842
; AVX512BW-NEXT: kmovd %eax, %k2
-; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm4 {%k2}
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[3,8,13,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[4,9,14],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm5 {%k2}
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm13[3,8,13,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[4,9,14],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX512BW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7]
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm2
; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,5,0,5,0,5,0,5]
-; AVX512BW-NEXT: vpermd %ymm2, %ymm3, %ymm2
+; AVX512BW-NEXT: vpermd %ymm2, %ymm19, %ymm2
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k5}
; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm0
-; AVX512BW-NEXT: vmovdqu16 %ymm9, %ymm8 {%k1}
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u,3,8,13],zero,zero,zero,xmm8[1,6,11],zero,zero,zero,zero
-; AVX512BW-NEXT: vextracti128 $1, %ymm8, %xmm3
+; AVX512BW-NEXT: vmovdqu16 %ymm8, %ymm7 {%k1}
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero
+; AVX512BW-NEXT: vextracti128 $1, %ymm7, %xmm3
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15]
; AVX512BW-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm0 {%k4}
; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vmovdqa64 %zmm18, (%rsi)
-; AVX512BW-NEXT: vmovdqa64 %zmm19, (%rdx)
-; AVX512BW-NEXT: vmovdqa64 %zmm14, (%rcx)
-; AVX512BW-NEXT: vmovdqa64 %zmm10, (%r8)
+; AVX512BW-NEXT: vmovdqa64 %zmm20, (%rsi)
+; AVX512BW-NEXT: vmovdqa64 %zmm14, (%rdx)
+; AVX512BW-NEXT: vmovdqa64 %zmm15, (%rcx)
+; AVX512BW-NEXT: vmovdqa64 %zmm4, (%r8)
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%r9)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
@@ -7453,163 +7453,164 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u]
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u]
-; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm10
+; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm9
; AVX512BW-FCP-NEXT: movl $67100672, %eax # imm = 0x3FFE000
; AVX512BW-FCP-NEXT: kmovd %eax, %k5
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm5
-; AVX512BW-FCP-NEXT: vmovdqa 224(%rdi), %ymm4
-; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm4, %ymm6 {%k1}
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm6
+; AVX512BW-FCP-NEXT: vmovdqa 224(%rdi), %ymm5
+; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm4 {%k1}
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1]
; AVX512BW-FCP-NEXT: movl $4228, %eax # imm = 0x1084
; AVX512BW-FCP-NEXT: kmovd %eax, %k3
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm7, %ymm6 {%k3}
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vmovdqa 176(%rdi), %xmm6
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %xmm7
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm11[2],xmm9[2],xmm11[3],xmm9[3]
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm7, %ymm4 {%k3}
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[u,u,u,u,u,u,u,3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm4
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm19 = [0,5,0,5,0,5,0,5]
+; AVX512BW-FCP-NEXT: vpermd %ymm8, %ymm19, %ymm8
; AVX512BW-FCP-NEXT: movl $127, %eax
; AVX512BW-FCP-NEXT: kmovd %eax, %k4
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm9, %ymm8 {%k4}
-; AVX512BW-FCP-NEXT: vmovdqa 144(%rdi), %xmm12
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm12[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[1,6,11]
-; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %xmm13
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
-; AVX512BW-FCP-NEXT: vpor %xmm9, %xmm11, %xmm9
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8
-; AVX512BW-FCP-NEXT: vmovdqu16 %zmm8, %zmm10 {%k5}
-; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm10, %ymm11
-; AVX512BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm9
-; AVX512BW-FCP-NEXT: vmovdqa 288(%rdi), %ymm8
-; AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm8, %ymm14 {%k2}
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u],zero,zero,zero,xmm15[3,8,13],zero,zero,zero,xmm15[1,6,11]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,4,9,14],zero,zero,zero,xmm14[2,7,12],zero,zero,zero
-; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6,7]
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm18
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm8, %ymm7 {%k4}
+; AVX512BW-FCP-NEXT: vmovdqa 144(%rdi), %xmm11
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[1,6,11]
+; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %xmm12
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
+; AVX512BW-FCP-NEXT: vpor %xmm8, %xmm10, %xmm8
+; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7
+; AVX512BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm9 {%k5}
+; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm9, %ymm10
+; AVX512BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm8
+; AVX512BW-FCP-NEXT: vmovdqa 288(%rdi), %ymm7
+; AVX512BW-FCP-NEXT: vpblendmw %ymm8, %ymm7, %ymm13 {%k2}
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u],zero,zero,zero,xmm14[3,8,13],zero,zero,zero,xmm14[1,6,11]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,4,9,14],zero,zero,zero,xmm13[2,7,12],zero,zero,zero
+; AVX512BW-FCP-NEXT: vpor %xmm14, %xmm13, %xmm13
+; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm13[5,6,7]
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm20
; AVX512BW-FCP-NEXT: movw $10570, %ax # imm = 0x294A
; AVX512BW-FCP-NEXT: kmovd %eax, %k3
-; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm11 {%k3}
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm11[2,3,0,1]
+; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm10 {%k3}
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1]
; AVX512BW-FCP-NEXT: movl $-2078212096, %eax # imm = 0x84210000
; AVX512BW-FCP-NEXT: kmovd %eax, %k6
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm14, %ymm11 {%k6}
-; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm14 {%k2}
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[1,6,11],zero,zero,zero,zero,xmm14[4,9,14],zero,zero,zero,xmm14[u,u,u]
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm14[0,5,10,15],zero,zero,zero,xmm14[3,8,13,u,u,u]
-; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm4, %ymm11 {%k2}
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm11[2,3,0,1]
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm13, %ymm10 {%k6}
+; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm13 {%k2}
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14],zero,zero,zero,xmm13[u,u,u]
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[0,5,10,15],zero,zero,zero,xmm13[3,8,13,u,u,u]
+; AVX512BW-FCP-NEXT: vpor %xmm14, %xmm13, %xmm14
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm10 {%k2}
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1]
; AVX512BW-FCP-NEXT: movl $8456, %eax # imm = 0x2108
; AVX512BW-FCP-NEXT: kmovd %eax, %k6
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm11 {%k6}
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm6[0,5,10,15,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm16, %xmm15
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm11 {%k4}
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[2,7,12]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
-; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm16, %xmm15
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm15, %zmm11
-; AVX512BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm14 {%k5}
-; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm14, %ymm11
-; AVX512BW-FCP-NEXT: vpblendmw %ymm8, %ymm9, %ymm15 {%k1}
-; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm16
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u],zero,zero,zero,zero,xmm16[4,9,14],zero,zero,zero,xmm16[2,7,12]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,0,5,10,15],zero,zero,zero,xmm15[3,8,13],zero,zero,zero
-; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm13, %ymm10 {%k6}
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %xmm10
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vmovdqa 176(%rdi), %xmm13
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[0,5,10,15,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm17, %xmm16
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k4}
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[2,7,12]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
+; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm17, %xmm16
+; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm16, %zmm15
+; AVX512BW-FCP-NEXT: vmovdqu16 %zmm15, %zmm14 {%k5}
+; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm14, %ymm15
+; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm8, %ymm16 {%k1}
+; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm17
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u],zero,zero,zero,zero,xmm17[4,9,14],zero,zero,zero,xmm17[2,7,12]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,0,5,10,15],zero,zero,zero,xmm16[3,8,13],zero,zero,zero
+; AVX512BW-FCP-NEXT: vporq %xmm17, %xmm16, %xmm16
+; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16
; AVX512BW-FCP-NEXT: movl $-524288, %eax # imm = 0xFFF80000
; AVX512BW-FCP-NEXT: kmovd %eax, %k4
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm11 {%k4}
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm14, %zmm19
-; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm14 {%k1}
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1]
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k4}
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm14
+; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm15 {%k1}
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm15[2,3,0,1]
; AVX512BW-FCP-NEXT: movl $138543104, %eax # imm = 0x8420000
; AVX512BW-FCP-NEXT: kmovd %eax, %k6
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k6}
-; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm15 {%k3}
-; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm16
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm16[1,6,11],zero,zero,zero,zero,xmm16[4,9,14,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[2,7,12],zero,zero,zero,xmm15[0,5,10,15],zero,zero,zero,xmm15[u,u,u]
-; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm5, %ymm14 {%k1}
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm14[2,3,0,1]
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k6}
+; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm16 {%k3}
+; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm17
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm17[1,6,11],zero,zero,zero,zero,xmm17[4,9,14,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[2,7,12],zero,zero,zero,xmm16[0,5,10,15],zero,zero,zero,xmm16[u,u,u]
+; AVX512BW-FCP-NEXT: vporq %xmm17, %xmm16, %xmm16
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 {%k5} = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm15 {%k1}
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm17 = ymm15[2,3,0,1]
; AVX512BW-FCP-NEXT: movl $16912, %eax # imm = 0x4210
; AVX512BW-FCP-NEXT: kmovd %eax, %k6
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm14 {%k6}
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm7[2,7,12],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm17, %xmm10
-; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm14[3,4,5,6,7]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm14[4,5,6,7]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[3,8,13]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
-; AVX512BW-FCP-NEXT: vporq %xmm14, %xmm16, %xmm14
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm14, %zmm10
-; AVX512BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm15 {%k5}
-; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm15, %ymm10
-; AVX512BW-FCP-NEXT: vpblendmw %ymm8, %ymm9, %ymm14 {%k2}
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm14[u,u,u,1,6,11],zero,zero,zero,zero,xmm14[4,9,14],zero,zero,zero
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u],zero,zero,zero,xmm14[0,5,10,15],zero,zero,zero,xmm14[3,8,13]
-; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm14, %xmm14
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm14, %ymm10 {%k4}
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm15, %zmm14
-; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm10 {%k2}
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm10[2,3,0,1]
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm15 {%k6}
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[1,6,11,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm10[2,7,12],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vporq %xmm17, %xmm18, %xmm4
+; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm15[3,4,5,6,7]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[3,8,13]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
+; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm17, %xmm15
+; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm15, %zmm4
+; AVX512BW-FCP-NEXT: vmovdqu16 %zmm4, %zmm16 {%k5}
+; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm16, %ymm4
+; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm8, %ymm15 {%k2}
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm15[u,u,u,1,6,11],zero,zero,zero,zero,xmm15[4,9,14],zero,zero,zero
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u],zero,zero,zero,xmm15[0,5,10,15],zero,zero,zero,xmm15[3,8,13]
+; AVX512BW-FCP-NEXT: vporq %xmm17, %xmm15, %xmm15
+; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm4 {%k4}
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm16, %zmm15
+; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k2}
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm4[2,3,0,1]
; AVX512BW-FCP-NEXT: movl $277086208, %eax # imm = 0x10840000
; AVX512BW-FCP-NEXT: kmovd %eax, %k5
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm10 {%k5}
-; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm15 {%k1}
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[3,8,13],zero,zero,zero,xmm15[1,6,11],zero,zero,zero,zero,xmm15[u,u,u]
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[2,7,12],zero,zero,zero,xmm15[0,5,10,15,u,u,u]
-; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm4 {%k5}
+; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm16 {%k1}
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm16[3,8,13],zero,zero,zero,xmm16[1,6,11],zero,zero,zero,zero,xmm16[u,u,u]
+; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm16
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm16[2,7,12],zero,zero,zero,xmm16[0,5,10,15,u,u,u]
+; AVX512BW-FCP-NEXT: vporq %xmm17, %xmm16, %xmm16
; AVX512BW-FCP-NEXT: movl $33546240, %eax # imm = 0x1FFE000
; AVX512BW-FCP-NEXT: kmovd %eax, %k5
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm5, %ymm10 {%k2}
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm10[2,3,0,1]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm4 {%k2}
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm17 = ymm4[2,3,0,1]
; AVX512BW-FCP-NEXT: movl $33825, %eax # imm = 0x8421
; AVX512BW-FCP-NEXT: kmovd %eax, %k5
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm10 {%k5}
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm17, %xmm11
-; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm10[3,4,5,6,7]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm12[4,9,14]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
-; AVX512BW-FCP-NEXT: vpor %xmm11, %xmm12, %xmm11
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm4 {%k5}
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[2,7,12,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm10[3,8,13],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vporq %xmm17, %xmm18, %xmm9
+; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm4[3,4,5,6,7]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm11[4,9,14]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
+; AVX512BW-FCP-NEXT: vpor %xmm9, %xmm11, %xmm9
+; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm9, %zmm4
; AVX512BW-FCP-NEXT: movl $33554431, %eax # imm = 0x1FFFFFF
; AVX512BW-FCP-NEXT: kmovq %rax, %k5
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm15, %zmm10 {%k5}
-; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm10, %ymm11
-; AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm8, %ymm12 {%k3}
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u],zero,zero,zero,xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,2,7,12],zero,zero,zero,xmm12[0,5,10,15],zero,zero,zero
-; AVX512BW-FCP-NEXT: vpor %xmm13, %xmm12, %xmm12
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm12, %ymm11 {%k4}
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10
+; AVX512BW-FCP-NEXT: vmovdqu8 %zmm16, %zmm4 {%k5}
+; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm4, %ymm9
+; AVX512BW-FCP-NEXT: vpblendmw %ymm8, %ymm7, %ymm11 {%k3}
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero,xmm12[4,9,14]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,2,7,12],zero,zero,zero,xmm11[0,5,10,15],zero,zero,zero
+; AVX512BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11
+; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm9 {%k4}
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4
; AVX512BW-FCP-NEXT: vmovdqu16 %ymm3, %ymm2 {%k2}
; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u]
@@ -7622,36 +7623,35 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2}
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
-; AVX512BW-FCP-NEXT: vmovdqu16 %ymm5, %ymm4 {%k3}
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm4[2,3,0,1]
+; AVX512BW-FCP-NEXT: vmovdqu16 %ymm6, %ymm5 {%k3}
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm5[2,3,0,1]
; AVX512BW-FCP-NEXT: movl $2114, %eax # imm = 0x842
; AVX512BW-FCP-NEXT: kmovd %eax, %k2
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm4 {%k2}
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[3,8,13,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[4,9,14],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm5 {%k2}
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm13[3,8,13,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[4,9,14],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7]
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm2
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,5,0,5,0,5,0,5]
-; AVX512BW-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2
+; AVX512BW-FCP-NEXT: vpermd %ymm2, %ymm19, %ymm2
; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm1 {%k5}
; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm1, %ymm0
-; AVX512BW-FCP-NEXT: vmovdqu16 %ymm9, %ymm8 {%k1}
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u,3,8,13],zero,zero,zero,xmm8[1,6,11],zero,zero,zero,zero
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm3
+; AVX512BW-FCP-NEXT: vmovdqu16 %ymm8, %ymm7 {%k1}
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm3
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15]
; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512BW-FCP-NEXT: vmovdqu8 %ymm2, %ymm0 {%k4}
; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, (%rsi)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, (%rdx)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, (%rcx)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%r8)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, (%rsi)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, (%rdx)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, (%rcx)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%r8)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%r9)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
@@ -7675,163 +7675,164 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm5, %xmm6
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u]
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u]
-; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm5, %xmm10
+; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm5, %xmm9
; AVX512DQ-BW-NEXT: movl $67100672, %eax # imm = 0x3FFE000
; AVX512DQ-BW-NEXT: kmovd %eax, %k5
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm10 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %ymm5
-; AVX512DQ-BW-NEXT: vmovdqa 224(%rdi), %ymm4
-; AVX512DQ-BW-NEXT: vpblendmw %ymm5, %ymm4, %ymm6 {%k1}
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm9 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %ymm6
+; AVX512DQ-BW-NEXT: vmovdqa 224(%rdi), %ymm5
+; AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm5, %ymm4 {%k1}
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1]
; AVX512DQ-BW-NEXT: movl $4228, %eax # imm = 0x1084
; AVX512DQ-BW-NEXT: kmovd %eax, %k3
-; AVX512DQ-BW-NEXT: vmovdqu8 %ymm7, %ymm6 {%k3}
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vmovdqa 176(%rdi), %xmm6
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %xmm7
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm11[2],xmm9[2],xmm11[3],xmm9[3]
+; AVX512DQ-BW-NEXT: vmovdqu8 %ymm7, %ymm4 {%k3}
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[u,u,u,u,u,u,u,3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %ymm4
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm19 = [0,5,0,5,0,5,0,5]
+; AVX512DQ-BW-NEXT: vpermd %ymm8, %ymm19, %ymm8
; AVX512DQ-BW-NEXT: movl $127, %eax
; AVX512DQ-BW-NEXT: kmovd %eax, %k4
-; AVX512DQ-BW-NEXT: vmovdqu8 %ymm9, %ymm8 {%k4}
-; AVX512DQ-BW-NEXT: vmovdqa 144(%rdi), %xmm12
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm12[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[1,6,11]
-; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %xmm13
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
-; AVX512DQ-BW-NEXT: vpor %xmm9, %xmm11, %xmm9
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8
-; AVX512DQ-BW-NEXT: vmovdqu16 %zmm8, %zmm10 {%k5}
-; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm10, %ymm11
-; AVX512DQ-BW-NEXT: vmovdqa 256(%rdi), %ymm9
-; AVX512DQ-BW-NEXT: vmovdqa 288(%rdi), %ymm8
-; AVX512DQ-BW-NEXT: vpblendmw %ymm9, %ymm8, %ymm14 {%k2}
-; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm14, %xmm15
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u],zero,zero,zero,xmm15[3,8,13],zero,zero,zero,xmm15[1,6,11]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,4,9,14],zero,zero,zero,xmm14[2,7,12],zero,zero,zero
-; AVX512DQ-BW-NEXT: vpor %xmm15, %xmm14, %xmm14
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6,7]
-; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm18
+; AVX512DQ-BW-NEXT: vmovdqu8 %ymm8, %ymm7 {%k4}
+; AVX512DQ-BW-NEXT: vmovdqa 144(%rdi), %xmm11
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[1,6,11]
+; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %xmm12
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
+; AVX512DQ-BW-NEXT: vpor %xmm8, %xmm10, %xmm8
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7
+; AVX512DQ-BW-NEXT: vmovdqu16 %zmm7, %zmm9 {%k5}
+; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm9, %ymm10
+; AVX512DQ-BW-NEXT: vmovdqa 256(%rdi), %ymm8
+; AVX512DQ-BW-NEXT: vmovdqa 288(%rdi), %ymm7
+; AVX512DQ-BW-NEXT: vpblendmw %ymm8, %ymm7, %ymm13 {%k2}
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm13, %xmm14
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u],zero,zero,zero,xmm14[3,8,13],zero,zero,zero,xmm14[1,6,11]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,4,9,14],zero,zero,zero,xmm13[2,7,12],zero,zero,zero
+; AVX512DQ-BW-NEXT: vpor %xmm14, %xmm13, %xmm13
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm13[5,6,7]
+; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm20
; AVX512DQ-BW-NEXT: movw $10570, %ax # imm = 0x294A
; AVX512DQ-BW-NEXT: kmovd %eax, %k3
-; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm11 {%k3}
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm14 = ymm11[2,3,0,1]
+; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm10 {%k3}
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1]
; AVX512DQ-BW-NEXT: movl $-2078212096, %eax # imm = 0x84210000
; AVX512DQ-BW-NEXT: kmovd %eax, %k6
-; AVX512DQ-BW-NEXT: vmovdqu8 %ymm14, %ymm11 {%k6}
-; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm14 {%k2}
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[1,6,11],zero,zero,zero,zero,xmm14[4,9,14],zero,zero,zero,xmm14[u,u,u]
-; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm14, %xmm14
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm14[0,5,10,15],zero,zero,zero,xmm14[3,8,13,u,u,u]
-; AVX512DQ-BW-NEXT: vpor %xmm15, %xmm14, %xmm14
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpblendmw %ymm5, %ymm4, %ymm11 {%k2}
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm15 = ymm11[2,3,0,1]
+; AVX512DQ-BW-NEXT: vmovdqu8 %ymm13, %ymm10 {%k6}
+; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm13 {%k2}
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14],zero,zero,zero,xmm13[u,u,u]
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm13, %xmm13
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[0,5,10,15],zero,zero,zero,xmm13[3,8,13,u,u,u]
+; AVX512DQ-BW-NEXT: vpor %xmm14, %xmm13, %xmm14
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm5, %ymm10 {%k2}
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1]
; AVX512DQ-BW-NEXT: movl $8456, %eax # imm = 0x2108
; AVX512DQ-BW-NEXT: kmovd %eax, %k6
-; AVX512DQ-BW-NEXT: vmovdqu8 %ymm15, %ymm11 {%k6}
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm6[0,5,10,15,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vporq %xmm15, %xmm16, %xmm15
-; AVX512DQ-BW-NEXT: vmovdqu8 %ymm15, %ymm11 {%k4}
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[2,7,12]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
-; AVX512DQ-BW-NEXT: vporq %xmm15, %xmm16, %xmm15
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm11, %zmm15, %zmm11
-; AVX512DQ-BW-NEXT: vmovdqu16 %zmm11, %zmm14 {%k5}
-; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm14, %ymm11
-; AVX512DQ-BW-NEXT: vpblendmw %ymm8, %ymm9, %ymm15 {%k1}
-; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm15, %xmm16
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u],zero,zero,zero,zero,xmm16[4,9,14],zero,zero,zero,xmm16[2,7,12]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,0,5,10,15],zero,zero,zero,xmm15[3,8,13],zero,zero,zero
-; AVX512DQ-BW-NEXT: vporq %xmm16, %xmm15, %xmm15
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512DQ-BW-NEXT: vmovdqu8 %ymm13, %ymm10 {%k6}
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %xmm10
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vmovdqa 176(%rdi), %xmm13
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[0,5,10,15,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vporq %xmm16, %xmm17, %xmm16
+; AVX512DQ-BW-NEXT: vmovdqu8 %ymm16, %ymm15 {%k4}
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[2,7,12]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
+; AVX512DQ-BW-NEXT: vporq %xmm16, %xmm17, %xmm16
+; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16
+; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm15, %zmm16, %zmm15
+; AVX512DQ-BW-NEXT: vmovdqu16 %zmm15, %zmm14 {%k5}
+; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm14, %ymm15
+; AVX512DQ-BW-NEXT: vpblendmw %ymm7, %ymm8, %ymm16 {%k1}
+; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm16, %xmm17
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u],zero,zero,zero,zero,xmm17[4,9,14],zero,zero,zero,xmm17[2,7,12]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,0,5,10,15],zero,zero,zero,xmm16[3,8,13],zero,zero,zero
+; AVX512DQ-BW-NEXT: vporq %xmm17, %xmm16, %xmm16
+; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16
; AVX512DQ-BW-NEXT: movl $-524288, %eax # imm = 0xFFF80000
; AVX512DQ-BW-NEXT: kmovd %eax, %k4
-; AVX512DQ-BW-NEXT: vmovdqu8 %ymm15, %ymm11 {%k4}
-; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm11, %zmm14, %zmm19
-; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm14 {%k1}
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1]
+; AVX512DQ-BW-NEXT: vmovdqu8 %ymm16, %ymm15 {%k4}
+; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm14
+; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm15 {%k1}
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm16 = ymm15[2,3,0,1]
; AVX512DQ-BW-NEXT: movl $138543104, %eax # imm = 0x8420000
; AVX512DQ-BW-NEXT: kmovd %eax, %k6
-; AVX512DQ-BW-NEXT: vmovdqu8 %ymm15, %ymm14 {%k6}
-; AVX512DQ-BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm15 {%k3}
-; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm15, %xmm16
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm16[1,6,11],zero,zero,zero,zero,xmm16[4,9,14,u,u,u]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[2,7,12],zero,zero,zero,xmm15[0,5,10,15],zero,zero,zero,xmm15[u,u,u]
-; AVX512DQ-BW-NEXT: vporq %xmm16, %xmm15, %xmm15
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpblendmw %ymm4, %ymm5, %ymm14 {%k1}
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm16 = ymm14[2,3,0,1]
+; AVX512DQ-BW-NEXT: vmovdqu8 %ymm16, %ymm15 {%k6}
+; AVX512DQ-BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm16 {%k3}
+; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm16, %xmm17
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm17[1,6,11],zero,zero,zero,zero,xmm17[4,9,14,u,u,u]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[2,7,12],zero,zero,zero,xmm16[0,5,10,15],zero,zero,zero,xmm16[u,u,u]
+; AVX512DQ-BW-NEXT: vporq %xmm17, %xmm16, %xmm16
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm16 {%k5} = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpblendmw %ymm5, %ymm6, %ymm15 {%k1}
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm17 = ymm15[2,3,0,1]
; AVX512DQ-BW-NEXT: movl $16912, %eax # imm = 0x4210
; AVX512DQ-BW-NEXT: kmovd %eax, %k6
-; AVX512DQ-BW-NEXT: vmovdqu8 %ymm16, %ymm14 {%k6}
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm7[2,7,12],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vporq %xmm16, %xmm17, %xmm10
-; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm14[3,4,5,6,7]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm14[4,5,6,7]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[3,8,13]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
-; AVX512DQ-BW-NEXT: vporq %xmm14, %xmm16, %xmm14
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm10, %zmm14, %zmm10
-; AVX512DQ-BW-NEXT: vmovdqu16 %zmm10, %zmm15 {%k5}
-; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm15, %ymm10
-; AVX512DQ-BW-NEXT: vpblendmw %ymm8, %ymm9, %ymm14 {%k2}
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm14[u,u,u,1,6,11],zero,zero,zero,zero,xmm14[4,9,14],zero,zero,zero
-; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm14, %xmm14
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u],zero,zero,zero,xmm14[0,5,10,15],zero,zero,zero,xmm14[3,8,13]
-; AVX512DQ-BW-NEXT: vporq %xmm16, %xmm14, %xmm14
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512DQ-BW-NEXT: vmovdqu8 %ymm14, %ymm10 {%k4}
-; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm10, %zmm15, %zmm14
-; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm10 {%k2}
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm15 = ymm10[2,3,0,1]
+; AVX512DQ-BW-NEXT: vmovdqu8 %ymm17, %ymm15 {%k6}
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[1,6,11,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm18 = xmm10[2,7,12],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vporq %xmm17, %xmm18, %xmm4
+; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm15[3,4,5,6,7]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[3,8,13]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
+; AVX512DQ-BW-NEXT: vporq %xmm15, %xmm17, %xmm15
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm15, %zmm4
+; AVX512DQ-BW-NEXT: vmovdqu16 %zmm4, %zmm16 {%k5}
+; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm16, %ymm4
+; AVX512DQ-BW-NEXT: vpblendmw %ymm7, %ymm8, %ymm15 {%k2}
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm15[u,u,u,1,6,11],zero,zero,zero,zero,xmm15[4,9,14],zero,zero,zero
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm15, %xmm15
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u],zero,zero,zero,xmm15[0,5,10,15],zero,zero,zero,xmm15[3,8,13]
+; AVX512DQ-BW-NEXT: vporq %xmm17, %xmm15, %xmm15
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512DQ-BW-NEXT: vmovdqu8 %ymm15, %ymm4 {%k4}
+; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm16, %zmm15
+; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k2}
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm16 = ymm4[2,3,0,1]
; AVX512DQ-BW-NEXT: movl $277086208, %eax # imm = 0x10840000
; AVX512DQ-BW-NEXT: kmovd %eax, %k5
-; AVX512DQ-BW-NEXT: vmovdqu8 %ymm15, %ymm10 {%k5}
-; AVX512DQ-BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm15 {%k1}
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[3,8,13],zero,zero,zero,xmm15[1,6,11],zero,zero,zero,zero,xmm15[u,u,u]
-; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm15, %xmm15
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[2,7,12],zero,zero,zero,xmm15[0,5,10,15,u,u,u]
-; AVX512DQ-BW-NEXT: vporq %xmm16, %xmm15, %xmm15
+; AVX512DQ-BW-NEXT: vmovdqu8 %ymm16, %ymm4 {%k5}
+; AVX512DQ-BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm16 {%k1}
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm16[3,8,13],zero,zero,zero,xmm16[1,6,11],zero,zero,zero,zero,xmm16[u,u,u]
+; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm16, %xmm16
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm16[2,7,12],zero,zero,zero,xmm16[0,5,10,15,u,u,u]
+; AVX512DQ-BW-NEXT: vporq %xmm17, %xmm16, %xmm16
; AVX512DQ-BW-NEXT: movl $33546240, %eax # imm = 0x1FFE000
; AVX512DQ-BW-NEXT: kmovd %eax, %k5
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpblendmw %ymm4, %ymm5, %ymm10 {%k2}
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm16 = ymm10[2,3,0,1]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm16 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpblendmw %ymm5, %ymm6, %ymm4 {%k2}
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm17 = ymm4[2,3,0,1]
; AVX512DQ-BW-NEXT: movl $33825, %eax # imm = 0x8421
; AVX512DQ-BW-NEXT: kmovd %eax, %k5
-; AVX512DQ-BW-NEXT: vmovdqu8 %ymm16, %ymm10 {%k5}
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vporq %xmm16, %xmm17, %xmm11
-; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm10[3,4,5,6,7]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm12[4,9,14]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
-; AVX512DQ-BW-NEXT: vpor %xmm11, %xmm12, %xmm11
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10
+; AVX512DQ-BW-NEXT: vmovdqu8 %ymm17, %ymm4 {%k5}
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[2,7,12,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm18 = xmm10[3,8,13],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vporq %xmm17, %xmm18, %xmm9
+; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm4[3,4,5,6,7]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm11[4,9,14]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
+; AVX512DQ-BW-NEXT: vpor %xmm9, %xmm11, %xmm9
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm9, %zmm4
; AVX512DQ-BW-NEXT: movl $33554431, %eax # imm = 0x1FFFFFF
; AVX512DQ-BW-NEXT: kmovq %rax, %k5
-; AVX512DQ-BW-NEXT: vmovdqu8 %zmm15, %zmm10 {%k5}
-; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm10, %ymm11
-; AVX512DQ-BW-NEXT: vpblendmw %ymm9, %ymm8, %ymm12 {%k3}
-; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm12, %xmm13
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u],zero,zero,zero,xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,2,7,12],zero,zero,zero,xmm12[0,5,10,15],zero,zero,zero
-; AVX512DQ-BW-NEXT: vpor %xmm13, %xmm12, %xmm12
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX512DQ-BW-NEXT: vmovdqu8 %ymm12, %ymm11 {%k4}
-; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10
+; AVX512DQ-BW-NEXT: vmovdqu8 %zmm16, %zmm4 {%k5}
+; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm4, %ymm9
+; AVX512DQ-BW-NEXT: vpblendmw %ymm8, %ymm7, %ymm11 {%k3}
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm11, %xmm12
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero,xmm12[4,9,14]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,2,7,12],zero,zero,zero,xmm11[0,5,10,15],zero,zero,zero
+; AVX512DQ-BW-NEXT: vpor %xmm12, %xmm11, %xmm11
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512DQ-BW-NEXT: vmovdqu8 %ymm11, %ymm9 {%k4}
+; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4
; AVX512DQ-BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k2}
; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u]
@@ -7844,36 +7845,35 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2}
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u]
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
-; AVX512DQ-BW-NEXT: vmovdqu16 %ymm5, %ymm4 {%k3}
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm1 = ymm4[2,3,0,1]
+; AVX512DQ-BW-NEXT: vmovdqu16 %ymm6, %ymm5 {%k3}
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm1 = ymm5[2,3,0,1]
; AVX512DQ-BW-NEXT: movl $2114, %eax # imm = 0x842
; AVX512DQ-BW-NEXT: kmovd %eax, %k2
-; AVX512DQ-BW-NEXT: vmovdqu8 %ymm1, %ymm4 {%k2}
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[3,8,13,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[4,9,14],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vmovdqu8 %ymm1, %ymm5 {%k2}
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm13[3,8,13,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[4,9,14],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7]
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm2
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,5,0,5,0,5,0,5]
-; AVX512DQ-BW-NEXT: vpermd %ymm2, %ymm3, %ymm2
+; AVX512DQ-BW-NEXT: vpermd %ymm2, %ymm19, %ymm2
; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k5}
; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm1, %ymm0
-; AVX512DQ-BW-NEXT: vmovdqu16 %ymm9, %ymm8 {%k1}
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u,3,8,13],zero,zero,zero,xmm8[1,6,11],zero,zero,zero,zero
-; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm8, %xmm3
+; AVX512DQ-BW-NEXT: vmovdqu16 %ymm8, %ymm7 {%k1}
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm7, %xmm3
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15]
; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512DQ-BW-NEXT: vmovdqu8 %ymm2, %ymm0 {%k4}
; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, (%rsi)
-; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, (%rdx)
-; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, (%rcx)
-; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, (%r8)
+; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, (%rsi)
+; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, (%rdx)
+; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, (%rcx)
+; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, (%r8)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%r9)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
@@ -7897,163 +7897,164 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm10
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm9
; AVX512DQ-BW-FCP-NEXT: movl $67100672, %eax # imm = 0x3FFE000
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k5
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm5
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 224(%rdi), %ymm4
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm4, %ymm6 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm6
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 224(%rdi), %ymm5
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm4 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1]
; AVX512DQ-BW-FCP-NEXT: movl $4228, %eax # imm = 0x1084
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm7, %ymm6 {%k3}
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 176(%rdi), %xmm6
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %xmm7
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm11[2],xmm9[2],xmm11[3],xmm9[3]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm7, %ymm4 {%k3}
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[u,u,u,u,u,u,u,3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm4
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm19 = [0,5,0,5,0,5,0,5]
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm8, %ymm19, %ymm8
; AVX512DQ-BW-FCP-NEXT: movl $127, %eax
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k4
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm9, %ymm8 {%k4}
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 144(%rdi), %xmm12
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm12[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[1,6,11]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %xmm13
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm9, %xmm11, %xmm9
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8
-; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm8, %zmm10 {%k5}
-; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm10, %ymm11
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm9
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 288(%rdi), %ymm8
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm8, %ymm14 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u],zero,zero,zero,xmm15[3,8,13],zero,zero,zero,xmm15[1,6,11]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,4,9,14],zero,zero,zero,xmm14[2,7,12],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm18
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm8, %ymm7 {%k4}
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 144(%rdi), %xmm11
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[1,6,11]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %xmm12
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm8, %xmm10, %xmm8
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7
+; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm9 {%k5}
+; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm9, %ymm10
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm8
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 288(%rdi), %ymm7
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm8, %ymm7, %ymm13 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u],zero,zero,zero,xmm14[3,8,13],zero,zero,zero,xmm14[1,6,11]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,4,9,14],zero,zero,zero,xmm13[2,7,12],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm14, %xmm13, %xmm13
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm13[5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm20
; AVX512DQ-BW-FCP-NEXT: movw $10570, %ax # imm = 0x294A
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm11 {%k3}
-; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm11[2,3,0,1]
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm10 {%k3}
+; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1]
; AVX512DQ-BW-FCP-NEXT: movl $-2078212096, %eax # imm = 0x84210000
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k6
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm14, %ymm11 {%k6}
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm14 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[1,6,11],zero,zero,zero,zero,xmm14[4,9,14],zero,zero,zero,xmm14[u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm14[0,5,10,15],zero,zero,zero,xmm14[3,8,13,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm4, %ymm11 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm11[2,3,0,1]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm13, %ymm10 {%k6}
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm13 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14],zero,zero,zero,xmm13[u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[0,5,10,15],zero,zero,zero,xmm13[3,8,13,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm14, %xmm13, %xmm14
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm10 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1]
; AVX512DQ-BW-FCP-NEXT: movl $8456, %eax # imm = 0x2108
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k6
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm11 {%k6}
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm6[0,5,10,15,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm16, %xmm15
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm11 {%k4}
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[2,7,12]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm16, %xmm15
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm15, %zmm11
-; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm14 {%k5}
-; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm14, %ymm11
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm8, %ymm9, %ymm15 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm16
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u],zero,zero,zero,zero,xmm16[4,9,14],zero,zero,zero,xmm16[2,7,12]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,0,5,10,15],zero,zero,zero,xmm15[3,8,13],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm13, %ymm10 {%k6}
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %xmm10
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 176(%rdi), %xmm13
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[0,5,10,15,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm17, %xmm16
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k4}
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[2,7,12]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm17, %xmm16
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm16, %zmm15
+; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm15, %zmm14 {%k5}
+; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm14, %ymm15
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm8, %ymm16 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm17
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u],zero,zero,zero,zero,xmm17[4,9,14],zero,zero,zero,xmm17[2,7,12]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,0,5,10,15],zero,zero,zero,xmm16[3,8,13],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm17, %xmm16, %xmm16
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16
; AVX512DQ-BW-FCP-NEXT: movl $-524288, %eax # imm = 0xFFF80000
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k4
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm11 {%k4}
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm14, %zmm19
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm14 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k4}
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm14
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm15 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm15[2,3,0,1]
; AVX512DQ-BW-FCP-NEXT: movl $138543104, %eax # imm = 0x8420000
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k6
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k6}
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm15 {%k3}
-; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm16
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm16[1,6,11],zero,zero,zero,zero,xmm16[4,9,14,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[2,7,12],zero,zero,zero,xmm15[0,5,10,15],zero,zero,zero,xmm15[u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm5, %ymm14 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm14[2,3,0,1]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k6}
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm16 {%k3}
+; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm17
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm17[1,6,11],zero,zero,zero,zero,xmm17[4,9,14,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[2,7,12],zero,zero,zero,xmm16[0,5,10,15],zero,zero,zero,xmm16[u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm17, %xmm16, %xmm16
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 {%k5} = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm15 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm17 = ymm15[2,3,0,1]
; AVX512DQ-BW-FCP-NEXT: movl $16912, %eax # imm = 0x4210
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k6
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm14 {%k6}
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm7[2,7,12],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm17, %xmm10
-; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm14[3,4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm14[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[3,8,13]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm14, %xmm16, %xmm14
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm14, %zmm10
-; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm15 {%k5}
-; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm15, %ymm10
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm8, %ymm9, %ymm14 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm14[u,u,u,1,6,11],zero,zero,zero,zero,xmm14[4,9,14],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u],zero,zero,zero,xmm14[0,5,10,15],zero,zero,zero,xmm14[3,8,13]
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm14, %ymm10 {%k4}
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm15, %zmm14
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm10 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm10[2,3,0,1]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm15 {%k6}
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[1,6,11,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm10[2,7,12],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm17, %xmm18, %xmm4
+; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm15[3,4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[3,8,13]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm17, %xmm15
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm15, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm4, %zmm16 {%k5}
+; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm16, %ymm4
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm8, %ymm15 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm15[u,u,u,1,6,11],zero,zero,zero,zero,xmm15[4,9,14],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u],zero,zero,zero,xmm15[0,5,10,15],zero,zero,zero,xmm15[3,8,13]
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm17, %xmm15, %xmm15
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm4 {%k4}
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm16, %zmm15
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm4[2,3,0,1]
; AVX512DQ-BW-FCP-NEXT: movl $277086208, %eax # imm = 0x10840000
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k5
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm10 {%k5}
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm15 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[3,8,13],zero,zero,zero,xmm15[1,6,11],zero,zero,zero,zero,xmm15[u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[2,7,12],zero,zero,zero,xmm15[0,5,10,15,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm4 {%k5}
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm16 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm16[3,8,13],zero,zero,zero,xmm16[1,6,11],zero,zero,zero,zero,xmm16[u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm16
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm16[2,7,12],zero,zero,zero,xmm16[0,5,10,15,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm17, %xmm16, %xmm16
; AVX512DQ-BW-FCP-NEXT: movl $33546240, %eax # imm = 0x1FFE000
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k5
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm5, %ymm10 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm10[2,3,0,1]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm4 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm17 = ymm4[2,3,0,1]
; AVX512DQ-BW-FCP-NEXT: movl $33825, %eax # imm = 0x8421
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k5
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm10 {%k5}
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm17, %xmm11
-; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm10[3,4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm12[4,9,14]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm11, %xmm12, %xmm11
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm4 {%k5}
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[2,7,12,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm10[3,8,13],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm17, %xmm18, %xmm9
+; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm4[3,4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm11[4,9,14]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm9, %xmm11, %xmm9
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm9, %zmm4
; AVX512DQ-BW-FCP-NEXT: movl $33554431, %eax # imm = 0x1FFFFFF
; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k5
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm15, %zmm10 {%k5}
-; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm10, %ymm11
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm8, %ymm12 {%k3}
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u],zero,zero,zero,xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,2,7,12],zero,zero,zero,xmm12[0,5,10,15],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm13, %xmm12, %xmm12
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm12, %ymm11 {%k4}
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm16, %zmm4 {%k5}
+; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm4, %ymm9
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm8, %ymm7, %ymm11 {%k3}
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero,xmm12[4,9,14]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,2,7,12],zero,zero,zero,xmm11[0,5,10,15],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm9 {%k4}
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm3, %ymm2 {%k2}
; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u]
@@ -8066,36 +8067,35 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2}
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm5, %ymm4 {%k3}
-; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm4[2,3,0,1]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm6, %ymm5 {%k3}
+; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm5[2,3,0,1]
; AVX512DQ-BW-FCP-NEXT: movl $2114, %eax # imm = 0x842
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm4 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[3,8,13,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[4,9,14],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm5 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm13[3,8,13,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[4,9,14],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm2
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,5,0,5,0,5,0,5]
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm2, %ymm19, %ymm2
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm1 {%k5}
; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm1, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm9, %ymm8 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u,3,8,13],zero,zero,zero,xmm8[1,6,11],zero,zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm3
+; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm8, %ymm7 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm3
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15]
; AVX512DQ-BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm2, %ymm0 {%k4}
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, (%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, (%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%r8)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, (%rsi)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%r8)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%r9)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll
index 9ce685f13e4766..f87126a98eea41 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll
@@ -7354,12 +7354,12 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-LABEL: load_i8_stride6_vf64:
; AVX512: # %bb.0:
; AVX512-NEXT: subq $40, %rsp
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u]
; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
; AVX512-NEXT: vmovdqa64 224(%rdi), %ymm25
; AVX512-NEXT: vmovdqa64 192(%rdi), %ymm26
; AVX512-NEXT: vmovdqa %ymm12, %ymm0
; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm25 ^ (ymm0 & (ymm26 ^ ymm25))
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u]
; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm3
; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u]
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm4
@@ -7608,12 +7608,12 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FCP-LABEL: load_i8_stride6_vf64:
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: subq $40, %rsp
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u]
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
; AVX512-FCP-NEXT: vmovdqa64 224(%rdi), %ymm25
; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm26
; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm0
; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm25 ^ (ymm0 & (ymm26 ^ ymm25))
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u]
; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm3
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u]
; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4
@@ -7862,12 +7862,12 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-LABEL: load_i8_stride6_vf64:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: subq $40, %rsp
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
; AVX512DQ-NEXT: vmovdqa64 224(%rdi), %ymm25
; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %ymm26
; AVX512DQ-NEXT: vmovdqa %ymm12, %ymm0
; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm25 ^ (ymm0 & (ymm26 ^ ymm25))
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u]
; AVX512DQ-NEXT: vpshufb %xmm1, %xmm0, %xmm3
; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u]
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm4
@@ -8116,12 +8116,12 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FCP-LABEL: load_i8_stride6_vf64:
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: subq $40, %rsp
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
; AVX512DQ-FCP-NEXT: vmovdqa64 224(%rdi), %ymm25
; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm26
; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm0
; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm25 ^ (ymm0 & (ymm26 ^ ymm25))
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm3
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4
@@ -8370,12 +8370,12 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-LABEL: load_i8_stride6_vf64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u]
; AVX512BW-NEXT: vmovdqa 224(%rdi), %ymm0
; AVX512BW-NEXT: vmovdqa64 192(%rdi), %ymm23
; AVX512BW-NEXT: movw $18724, %r10w # imm = 0x4924
; AVX512BW-NEXT: kmovd %r10d, %k1
; AVX512BW-NEXT: vpblendmw %ymm0, %ymm23, %ymm9 {%k1}
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u]
; AVX512BW-NEXT: vpshufb %xmm2, %xmm9, %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u]
; AVX512BW-NEXT: vextracti128 $1, %ymm9, %xmm12
@@ -8606,12 +8606,12 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-LABEL: load_i8_stride6_vf64:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vmovdqa 224(%rdi), %ymm0
; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm23
; AVX512BW-FCP-NEXT: movw $18724, %r10w # imm = 0x4924
; AVX512BW-FCP-NEXT: kmovd %r10d, %k1
; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm23, %ymm9 {%k1}
+; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm9, %xmm1
; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm9, %xmm12
@@ -8842,12 +8842,12 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-LABEL: load_i8_stride6_vf64:
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u]
; AVX512DQ-BW-NEXT: vmovdqa 224(%rdi), %ymm0
; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %ymm23
; AVX512DQ-BW-NEXT: movw $18724, %r10w # imm = 0x4924
; AVX512DQ-BW-NEXT: kmovd %r10d, %k1
; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm23, %ymm9 {%k1}
+; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u]
; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm9, %xmm1
; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u]
; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm9, %xmm12
@@ -9078,12 +9078,12 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-FCP-LABEL: load_i8_stride6_vf64:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vmovdqa 224(%rdi), %ymm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm23
; AVX512DQ-BW-FCP-NEXT: movw $18724, %r10w # imm = 0x4924
; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k1
; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm23, %ymm9 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm9, %xmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm9, %xmm12
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
index bea6219b9fbacb..5ab09194c5b831 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
@@ -12121,414 +12121,399 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX512-FCP-LABEL: load_i8_stride7_vf64:
; AVX512-FCP: # %bb.0:
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
-; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm19
-; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm10
-; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm30
-; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm1
-; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm7
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm10 ^ (ymm1 & (ymm19 ^ ymm10))
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512-FCP-NEXT: vmovdqa64 96(%rdi), %ymm27
-; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm2
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm30 ^ (ymm2 & (ymm27 ^ ymm30))
-; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm9
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm9[2],ymm2[3,4],ymm9[5],ymm2[6,7,8,9],ymm9[10],ymm2[11,12],ymm9[13],ymm2[14,15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 | (ymm1 & mem)
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
-; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm31
-; AVX512-FCP-NEXT: vmovdqa64 160(%rdi), %ymm29
-; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm1
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm31 ^ (ymm1 & (ymm29 ^ ymm31))
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
+; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm20
+; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm12
+; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm27
+; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm0
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm20 ^ ymm12))
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512-FCP-NEXT: vmovdqa64 96(%rdi), %ymm31
+; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm1
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm27 ^ (ymm1 & (ymm31 ^ ymm27))
+; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm6
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm6[2],ymm1[3,4],ymm6[5],ymm1[6,7,8,9],ymm6[10],ymm1[11,12],ymm6[13],ymm1[14,15]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm0 & mem)
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
+; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm28
+; AVX512-FCP-NEXT: vmovdqa64 160(%rdi), %ymm30
+; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm0
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm28 ^ (ymm0 & (ymm30 ^ ymm28))
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[6,13,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,2,4,6]
-; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm18
-; AVX512-FCP-NEXT: vpermd %ymm18, %ymm2, %ymm2
+; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm3
+; AVX512-FCP-NEXT: vpermd %ymm3, %ymm2, %ymm2
; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
-; AVX512-FCP-NEXT: vmovdqa 240(%rdi), %xmm8
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm8[5,12,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %xmm0
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm2, %xmm5, %xmm2
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm20
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm2 & (zmm20 ^ zmm4))
-; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %ymm11
-; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %ymm26
-; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm4
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm11 ^ (ymm4 & (ymm26 ^ ymm11))
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,3,10],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[u,u]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11,u,u]
-; AVX512-FCP-NEXT: vpor %xmm5, %xmm4, %xmm13
-; AVX512-FCP-NEXT: vmovdqa 352(%rdi), %ymm15
-; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %ymm16
-; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm5
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm15 ^ (ymm5 & (ymm16 ^ ymm15))
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm5[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm14[2],ymm5[3,4,5],ymm14[6],ymm5[7,8,9],ymm14[10],ymm5[11,12,13],ymm14[14],ymm5[15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm21 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm13 & ymm21)
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
-; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm13
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm10 ^ (ymm13 & (ymm19 ^ ymm10))
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm3
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm3, %xmm13, %xmm3
-; AVX512-FCP-NEXT: vmovdqa %ymm7, %ymm1
-; AVX512-FCP-NEXT: vmovdqa %ymm7, %ymm13
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm27 ^ (ymm13 & (ymm30 ^ ymm27))
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm9[2],ymm13[3,4,5],ymm9[6],ymm13[7,8,9],ymm9[10],ymm13[11,12,13],ymm9[14],ymm13[15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
+; AVX512-FCP-NEXT: vmovdqa 240(%rdi), %xmm4
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm4[5,12,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %xmm5
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm2, %xmm7, %xmm2
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm8
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm21 & (zmm8 ^ zmm1))
+; AVX512-FCP-NEXT: vmovdqa64 288(%rdi), %ymm16
+; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm11
+; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm0
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm16 ^ (ymm0 & (ymm11 ^ ymm16))
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[u,u,u,u,u,3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u]
+; AVX512-FCP-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX512-FCP-NEXT: vmovdqa 352(%rdi), %ymm14
+; AVX512-FCP-NEXT: vmovdqa 320(%rdi), %ymm2
+; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm7
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm14 ^ (ymm7 & (ymm2 ^ ymm14))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm7[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm13[2],ymm7[3,4,5],ymm13[6],ymm7[7,8,9],ymm13[10],ymm7[11,12,13],ymm13[14],ymm7[15]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm26 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm0 & ymm26)
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
+; AVX512-FCP-NEXT: vmovdqa64 416(%rdi), %ymm17
+; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %ymm18
+; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm7
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm17 ^ (ymm7 & (ymm18 ^ ymm17))
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm15
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm7[4,11],zero,zero
+; AVX512-FCP-NEXT: vpor %xmm7, %xmm15, %xmm7
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm15
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm23 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm15 ^ (ymm23 & (ymm15 ^ ymm13))
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm0
+; AVX512-FCP-NEXT: vpmovsxwd {{.*#+}} zmm29 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0,0,0,0,0]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm29 & (zmm0 ^ zmm8))
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm8
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm12 ^ (ymm8 & (ymm20 ^ ymm12))
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm13
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[6,13],zero,zero,xmm13[2,9,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[1,8,15],zero,zero,xmm8[4,11],zero,zero,xmm8[u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm13, %xmm8, %xmm8
+; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm13
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm31 ^ (ymm13 & (ymm27 ^ ymm31))
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm6[2],ymm13[3,4,5],ymm6[6],ymm13[7,8,9],ymm6[10],ymm13[11,12,13],ymm6[14],ymm13[15]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm3 & ~mem)
-; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm3
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm31 ^ (ymm3 & (ymm29 ^ ymm31))
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,1,3,4,6]
-; AVX512-FCP-NEXT: vpermd %ymm18, %ymm7, %ymm7
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm7[7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm8[6,13,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm8, %xmm23
-; AVX512-FCP-NEXT: vmovdqa %xmm0, %xmm4
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[1,8,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm8 & ~mem)
+; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm8
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm28 ^ (ymm8 & (ymm30 ^ ymm28))
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,6,13],zero,zero,xmm8[2,9],zero,zero,zero,xmm8[u,u,u,u]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u],zero,zero,xmm8[4,11],zero,zero,xmm8[0,7,14,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm15, %xmm8, %xmm8
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,0,0,0,1,3,4,6]
+; AVX512-FCP-NEXT: vpermd %ymm3, %ymm15, %ymm15
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm15[7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm4[6,13,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[1,8,15],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm7, %xmm15, %xmm7
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm8, %zmm7
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm21 & (zmm7 ^ zmm13))
+; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm8
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm11 ^ (ymm8 & (ymm16 ^ ymm11))
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm13
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u],zero,zero,xmm13[2,9],zero,zero,zero,xmm13[5,12,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u]
+; AVX512-FCP-NEXT: vpor %xmm13, %xmm8, %xmm8
+; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm13
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm14 ^ (ymm13 & (ymm2 ^ ymm14))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm13[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7,8,9,10],ymm15[11],ymm13[12,13],ymm15[14],ymm13[15]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm8 & ymm26)
+; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm8
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm17 ^ (ymm8 & (ymm18 ^ ymm17))
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm8[5,12],zero,zero
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u],zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10]
+; AVX512-FCP-NEXT: vpor %xmm15, %xmm8, %xmm8
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 ^ (ymm23 & (ymm8 ^ ymm13))
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm24
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm24 = zmm24 ^ (zmm29 & (zmm24 ^ zmm7))
+; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm7
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm12 ^ (ymm7 & (ymm20 ^ ymm12))
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm3, %zmm22
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm2 & (zmm22 ^ zmm13))
-; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm2
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm10 ^ (ymm2 & (ymm19 ^ ymm10))
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm3
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm27 ^ (ymm3 & (ymm30 ^ ymm27))
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm9[3],ymm3[4,5],ymm9[6],ymm3[7,8,9,10],ymm9[11],ymm3[12,13],ymm9[14],ymm3[15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm17 = [18446744073709551615,255,18446744073709486080,18446744073709551615]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm2 & ymm17)
-; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm2
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm29 ^ (ymm2 & (ymm31 ^ ymm29))
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm7
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm7, %xmm2, %xmm2
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,1,3,5,6]
-; AVX512-FCP-NEXT: vpermd %ymm18, %ymm7, %ymm7
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm7[7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %xmm23, %xmm1
-; AVX512-FCP-NEXT: vmovdqa64 %xmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm8
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm31 ^ (ymm8 & (ymm27 ^ ymm31))
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7,8,9,10],ymm6[11],ymm8[12,13],ymm6[14],ymm8[15]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm25 = [18446744073709551615,255,18446744073709486080,18446744073709551615]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm7 & ymm25)
+; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm7
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm30 ^ (ymm7 & (ymm28 ^ ymm30))
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm13
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[1,8,15,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,0,7,14],zero,zero,xmm7[3,10],zero,zero,zero,xmm7[u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm7, %xmm13, %xmm7
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,0,0,0,1,3,5,6]
+; AVX512-FCP-NEXT: vpermd %ymm3, %ymm13, %ymm3
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5,6],ymm3[7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[2,9],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm7, %xmm13, %xmm7
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm3, %zmm3
+; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm22 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm8 ^ (zmm22 & (zmm3 ^ zmm8))
+; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm7
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm11 ^ (ymm7 & (ymm16 ^ ymm11))
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u],zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,5,12],zero,zero,xmm7[1,8,15],zero,zero,xmm7[u,u]
; AVX512-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm2, %zmm23
-; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm18 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm23 = zmm3 ^ (zmm18 & (zmm23 ^ zmm3))
-; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm2
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm10 ^ (ymm2 & (ymm19 ^ ymm10))
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm3
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm27 ^ (ymm3 & (ymm30 ^ ymm27))
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0],ymm3[1,2],ymm9[3],ymm3[4,5,6],ymm9[7,8],ymm3[9,10],ymm9[11],ymm3[12,13,14],ymm9[15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm2 & ymm17)
-; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm2
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm29 ^ (ymm2 & (ymm31 ^ ymm29))
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm7
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm7, %xmm2, %xmm2
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm7
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vmovdqa 208(%rdi), %xmm0
-; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm8
-; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %xmm0
-; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
-; AVX512-FCP-NEXT: vpor %xmm8, %xmm13, %xmm8
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 ^ (mem & (ymm8 ^ ymm7))
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[3,10],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm1[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm8
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm2 ^ (ymm8 & (ymm14 ^ ymm2))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm8[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm13[0],ymm8[1,2],ymm13[3],ymm8[4,5,6],ymm13[7,8],ymm8[9,10],ymm13[11],ymm8[12,13,14],ymm13[15]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm7 & ymm26)
+; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm7
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm17 ^ (ymm7 & (ymm18 ^ ymm17))
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11]
; AVX512-FCP-NEXT: vpor %xmm7, %xmm13, %xmm7
-; AVX512-FCP-NEXT: vmovdqa64 416(%rdi), %ymm24
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm8, %zmm28
-; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %ymm25
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm28 = zmm3 ^ (zmm18 & (zmm28 ^ zmm3))
-; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm3
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24))
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero
-; AVX512-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm0 & (ymm3 ^ ymm5))
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm18
-; AVX512-FCP-NEXT: vpmovsxwd {{.*#+}} zmm5 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0,0,0,0,0]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm18 = zmm18 ^ (zmm5 & (zmm18 ^ zmm20))
-; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm3
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm26 ^ (ymm3 & (ymm11 ^ ymm26))
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero,xmm3[u,u]
-; AVX512-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3
-; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm7
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm15 ^ (ymm7 & (ymm16 ^ ymm15))
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7,8,9,10],ymm8[11],ymm7[12,13],ymm8[14],ymm7[15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm3 & ymm21)
-; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm3
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24))
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm3[5,12],zero,zero
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10]
-; AVX512-FCP-NEXT: vpor %xmm3, %xmm8, %xmm3
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm0 & (ymm3 ^ ymm7))
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm20
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm5 & (zmm20 ^ zmm22))
-; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm3
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm26 ^ (ymm3 & (ymm11 ^ ymm26))
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u],zero,zero,xmm7[3,10],zero,zero,zero,xmm7[6,13,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u]
-; AVX512-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3
-; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm7
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm16 ^ (ymm7 & (ymm15 ^ ymm16))
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2],ymm8[3],ymm7[4,5,6],ymm8[7,8],ymm7[9,10],ymm8[11],ymm7[12,13,14],ymm8[15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm3 & ymm21)
-; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm3
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24))
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11]
-; AVX512-FCP-NEXT: vpor %xmm3, %xmm8, %xmm3
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm0 & (ymm3 ^ ymm7))
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm22
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm5 & (zmm22 ^ zmm23))
-; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm3
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm16 ^ (ymm3 & (ymm15 ^ ymm16))
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm3[1,2,3],ymm7[4],ymm3[5,6],ymm7[7,8],ymm3[9,10,11],ymm7[12],ymm3[13,14],ymm7[15]
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm2
-; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm3
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm26 ^ (ymm3 & (ymm11 ^ ymm26))
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u]
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm23 & (ymm7 ^ ymm8))
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm21
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm29 & (zmm21 ^ zmm3))
+; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm3
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm12 ^ (ymm3 & (ymm20 ^ ymm12))
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[3,10],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm21) | ymm2
-; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm2
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm24 ^ ymm25))
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm7
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,4,11],zero,zero,xmm2[0,7,14],zero,zero
-; AVX512-FCP-NEXT: vpor %xmm7, %xmm2, %xmm2
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (ymm0 & (ymm2 ^ ymm3))
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm23
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm5 & (zmm23 ^ zmm28))
-; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm2
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm16 ^ (ymm2 & (ymm15 ^ ymm16))
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6,7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14,15]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
+; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm7
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm31 ^ (ymm7 & (ymm27 ^ ymm31))
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0],ymm7[1,2],ymm6[3],ymm7[4,5,6],ymm6[7,8],ymm7[9,10],ymm6[11],ymm7[12,13,14],ymm6[15]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm3 & ymm25)
; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm3
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm11 ^ (ymm3 & (ymm26 ^ ymm11))
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u]
-; AVX512-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm21) | ymm2
-; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm2
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm24 ^ ymm25))
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm5
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[6,13]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero
-; AVX512-FCP-NEXT: vpor %xmm5, %xmm2, %xmm2
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm4
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (ymm0 & (ymm4 ^ ymm3))
-; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm2
-; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm21
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm15 ^ (ymm2 & (ymm16 ^ ymm15))
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7,8],ymm3[9],ymm2[10,11,12],ymm3[13],ymm2[14,15]
-; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm3
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm11 ^ (ymm3 & (ymm26 ^ ymm11))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm30 ^ (ymm3 & (ymm28 ^ ymm30))
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm8
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[2,9,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm3, %xmm8, %xmm8
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vmovdqa 208(%rdi), %xmm0
+; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm0
+; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %xmm15
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
+; AVX512-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm3
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm3))
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[3,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm0
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm7 ^ (zmm22 & (zmm0 ^ zmm7))
+; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm3
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm2 ^ (ymm3 & (ymm14 ^ ymm2))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3],ymm4[4],ymm3[5,6],ymm4[7,8],ymm3[9,10,11],ymm4[12],ymm3[13,14],ymm4[15]
+; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm3, %ymm3
+; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm4
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm11 ^ (ymm4 & (ymm16 ^ ymm11))
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,6,13],zero,zero,xmm4[2,9],zero,zero,zero,xmm4[u,u]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,xmm4[4,11],zero,zero,xmm4[0,7,14,u,u]
+; AVX512-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = (ymm4 & ymm26) | ymm3
+; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm3
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm18 ^ (ymm3 & (ymm17 ^ ymm18))
; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u],zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[5,12]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero
; AVX512-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm23 & (ymm3 ^ ymm4))
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm22
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm29 & (zmm22 ^ zmm0))
+; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm0
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm0 & (ymm14 ^ ymm2))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6,7,8],ymm3[9],ymm0[10,11],ymm3[12],ymm0[13,14,15]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm3
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm16 ^ (ymm3 & (ymm11 ^ ymm16))
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u]
+; AVX512-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm26) | ymm0
+; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm0
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm18 ^ (ymm0 & (ymm17 ^ ymm18))
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero
+; AVX512-FCP-NEXT: vpor %xmm4, %xmm0, %xmm0
+; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm29
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm29 = ymm29 ^ (ymm23 & (ymm29 ^ ymm3))
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,128,128,128,128,128,128,128,128,128,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ~mem) | ymm2
-; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm2
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm24 ^ ymm25))
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14]
-; AVX512-FCP-NEXT: vpor %xmm7, %xmm2, %xmm2
-; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm28
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm28 = ymm28 ^ (ymm0 & (ymm28 ^ ymm3))
-; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm2
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm19 ^ (ymm2 & (ymm10 ^ ymm19))
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm21 = ymm24 ^ (ymm21 & (ymm25 ^ ymm24))
-; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm3
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm19 ^ (ymm3 & (ymm10 ^ ymm19))
+; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm0
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm14 ^ (ymm0 & (ymm2 ^ ymm14))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7,8],ymm3[9],ymm0[10,11,12],ymm3[13],ymm0[14,15]
+; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm3
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm16 ^ (ymm3 & (ymm11 ^ ymm16))
; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[3,10],zero,zero,zero,xmm7[6,13,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u]
; AVX512-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3
-; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm7
-; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm13
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm19 ^ (ymm6 & (ymm10 ^ ymm19))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm27 ^ (ymm7 & (ymm30 ^ ymm27))
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm6[6,13],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[4,11],zero,zero,xmm6[0,7,14,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm6, %xmm8, %xmm8
-; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm10
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm26 = ymm11 ^ (ymm12 & (ymm26 ^ ymm11))
-; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm6
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm30 ^ (ymm6 & (ymm27 ^ ymm30))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm30 ^ (ymm12 & (ymm27 ^ ymm30))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ~mem) | ymm0
+; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm0
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm18 ^ (ymm0 & (ymm17 ^ ymm18))
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14]
+; AVX512-FCP-NEXT: vpor %xmm7, %xmm0, %xmm0
+; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm26
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm26 = ymm26 ^ (ymm23 & (ymm26 ^ ymm3))
+; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm13
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm20 ^ (ymm13 & (ymm12 ^ ymm20))
+; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm7
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm19 = ymm17 ^ (ymm19 & (ymm18 ^ ymm17))
; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm0
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0],ymm7[1,2,3],ymm9[4],ymm7[5,6],ymm9[7,8],ymm7[9,10,11],ymm9[12],ymm7[13,14],ymm9[15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm2 & ymm17)
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm0[1],ymm6[2,3],ymm0[4],ymm6[5,6,7,8],ymm0[9],ymm6[10,11],ymm0[12],ymm6[13,14,15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm3 & ymm17)
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0],ymm0[1],ymm12[2,3,4],ymm0[5],ymm12[6,7,8],ymm0[9],ymm12[10,11,12],ymm0[13],ymm12[14,15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm8 & ymm17)
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm29 ^ (ymm10 & (ymm31 ^ ymm29))
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm10[u,u,2,9],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[u,u,u,u,u]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm3
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm20 ^ (ymm0 & (ymm12 ^ ymm20))
+; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm8
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm20 ^ (ymm10 & (ymm12 ^ ymm20))
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm3
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[4,11],zero,zero,xmm13[0,7,14],zero,zero,xmm13[u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm3, %xmm12, %xmm3
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm31 ^ (ymm7 & (ymm27 ^ ymm31))
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm12
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm0, %xmm12, %xmm0
+; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm12
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm16 ^ (ymm9 & (ymm11 ^ ymm16))
+; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm13
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm27 ^ (ymm13 & (ymm31 ^ ymm27))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm27 ^ (ymm9 & (ymm31 ^ ymm27))
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm10[6,13],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm10
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[4,11],zero,zero,xmm10[0,7,14,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm4, %xmm10, %xmm4
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0],ymm7[1,2,3],ymm6[4],ymm7[5,6],ymm6[7,8],ymm7[9,10,11],ymm6[12],ymm7[13,14],ymm6[15]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm10 | (ymm3 & ymm25)
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm13[0],ymm6[1],ymm13[2,3],ymm6[4],ymm13[5,6,7,8],ymm6[9],ymm13[10,11],ymm6[12],ymm13[13,14,15]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm0 & ymm25)
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm6[1],ymm9[2,3,4],ymm6[5],ymm9[6,7,8],ymm6[9],ymm9[10,11,12],ymm6[13],ymm9[14,15]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm4 & ymm25)
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm14 ^ (ymm1 & (ymm2 ^ ymm14))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm30 ^ (ymm12 & (ymm28 ^ ymm30))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm30 ^ (ymm8 & (ymm28 ^ ymm30))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm28 ^ (ymm1 & (ymm30 ^ ymm28))
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm12[u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm3
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm3
+; AVX512-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0
; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm12[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
-; AVX512-FCP-NEXT: vpor %xmm3, %xmm8, %xmm3
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm12, %xmm3
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm15[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
+; AVX512-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm11 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm11 & (ymm3 ^ ymm2))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm16 = ymm15 ^ (ymm14 & (ymm16 ^ ymm15))
-; AVX512-FCP-NEXT: vmovd {{.*#+}} xmm8 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm29 ^ (ymm13 & (ymm31 ^ ymm29))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm31 ^ (ymm14 & (ymm29 ^ ymm31))
-; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm2
-; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm15[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm3 & (zmm2 ^ zmm9))
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm13[u,u,3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm10
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,xmm10[1,8,15],zero,zero,xmm10[4,11,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm9, %xmm10, %xmm9
-; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm5
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
-; AVX512-FCP-NEXT: vpor %xmm5, %xmm10, %xmm5
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm9 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm9 & (ymm3 ^ ymm0))
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm8[u,u,3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm4
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm0, %xmm4, %xmm0
+; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm12, %xmm4
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm15[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
+; AVX512-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (ymm9 & (ymm4 ^ ymm0))
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[5,12,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,4,11],zero,zero,xmm1[0,7,14],zero,zero,xmm1[u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm0, %xmm1, %xmm1
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,8,15,22,29,20,27,18,25,128,128,128,128,128,128,128,128,128,128]
+; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm5
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm15[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
+; AVX512-FCP-NEXT: vpor %xmm5, %xmm8, %xmm5
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (ymm11 & (ymm5 ^ ymm9))
-; AVX512-FCP-NEXT: vmovdqa %ymm11, %ymm13
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm14, %xmm9
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm14[u,u,4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm9, %xmm10, %xmm9
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,8,15,22,29,20,27,18,25,128,128,128,128,128,128,128,128,128,128]
-; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm11
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
-; AVX512-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 ^ (ymm13 & (ymm11 ^ ymm9))
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm15[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm5, %zmm5
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (ymm9 & (ymm5 ^ ymm1))
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,2,4,6,0,0,0,0]
+; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm8
+; AVX512-FCP-NEXT: vpermd %ymm8, %ymm1, %ymm1
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm1
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm3 & (zmm1 ^ zmm10))
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [1,3,4,6,0,0,0,0]
+; AVX512-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm9
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm9, %zmm4, %zmm4
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm3 & (zmm4 ^ zmm7))
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,3,5,6,0,0,0,0]
+; AVX512-FCP-NEXT: vpermd %ymm8, %ymm7, %ymm7
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[2,5,8,15,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm5, %zmm5
; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm3 & (zmm5 ^ zmm6))
-; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm15, %xmm6
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm11, %zmm6
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm3 & (zmm6 ^ zmm7))
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm3
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm3
; AVX512-FCP-NEXT: movw $-512, %ax # imm = 0xFE00
; AVX512-FCP-NEXT: kmovw %eax, %k1
-; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm2 {%k1}
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm28, %zmm0, %zmm3
-; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm5 {%k1}
-; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm0
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm16[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15]
-; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm3, %ymm3
-; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm0
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,u,u,2,9],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[u,u,u]
-; AVX512-FCP-NEXT: vextracti32x4 $1, %ymm26, %xmm1
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm4, %xmm1, %xmm1
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & mem) | ymm3
-; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm0
-; AVX512-FCP-NEXT: vextracti32x4 $1, %ymm21, %xmm3
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,0,7,14],zero,zero,xmm0[3,10],zero,zero,zero
-; AVX512-FCP-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm1 {%k1}
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm26, %zmm0, %zmm3
+; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm4 {%k1}
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15]
+; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm0
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm11[u,u,u,u,2,9],zero,zero,zero,xmm11[5,12],zero,zero,xmm11[u,u,u]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm3
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & mem) | ymm0
+; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm3
+; AVX512-FCP-NEXT: vextracti32x4 $1, %ymm19, %xmm0
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero
+; AVX512-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0
; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm6 {%k1}
-; AVX512-FCP-NEXT: vmovdqa64 %zmm18, (%rsi)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm20, (%rdx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm22, (%rcx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm23, (%r8)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%r9)
+; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm5 {%k1}
+; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovaps %zmm0, (%rsi)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm24, (%rdx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm21, (%rcx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm22, (%r8)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%r9)
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rax)
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT: vmovdqa64 %zmm6, (%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rax)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
@@ -12961,413 +12946,405 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX512DQ-FCP-LABEL: load_i8_stride7_vf64:
; AVX512DQ-FCP: # %bb.0:
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
-; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm26
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm11
-; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm30
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm7
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm11 ^ (ymm1 & (ymm26 ^ ymm11))
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT: pushq %rax
+; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm11
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm12
+; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm27
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm0
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm11 ^ ymm12))
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm0, %xmm1
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512DQ-FCP-NEXT: vmovdqa64 96(%rdi), %ymm29
+; AVX512DQ-FCP-NEXT: vmovdqa64 96(%rdi), %ymm31
; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm2
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm30 ^ (ymm2 & (ymm29 ^ ymm30))
-; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm9
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm9[2],ymm2[3,4],ymm9[5],ymm2[6,7,8,9],ymm9[10],ymm2[11,12],ymm9[13],ymm2[14,15]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 | (ymm1 & mem)
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
-; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm24
-; AVX512DQ-FCP-NEXT: vmovdqa64 160(%rdi), %ymm31
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm1
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm24 ^ (ymm1 & (ymm31 ^ ymm24))
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[6,13,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm27 ^ (ymm2 & (ymm31 ^ ymm27))
+; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm3
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm20
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm1 & mem)
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
+; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm28
+; AVX512DQ-FCP-NEXT: vmovdqa64 160(%rdi), %ymm30
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm1
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm28 ^ (ymm1 & (ymm30 ^ ymm28))
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm1, %xmm1
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,2,4,6]
-; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm18
-; AVX512DQ-FCP-NEXT: vpermd %ymm18, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
-; AVX512DQ-FCP-NEXT: vmovdqa 240(%rdi), %xmm8
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm8[5,12,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %xmm0
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm5, %xmm2
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm20
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm2 & (zmm20 ^ zmm4))
-; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %ymm10
-; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %ymm19
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm4
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm10 ^ (ymm4 & (ymm19 ^ ymm10))
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,3,10],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[u,u]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm4, %xmm13
-; AVX512DQ-FCP-NEXT: vmovdqa 352(%rdi), %ymm15
-; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %ymm16
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm5
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm15 ^ (ymm5 & (ymm16 ^ ymm15))
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm5[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm14[2],ymm5[3,4,5],ymm14[6],ymm5[7,8,9],ymm14[10],ymm5[11,12,13],ymm14[14],ymm5[15]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm23 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm13 & ymm23)
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm13
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm11 ^ (ymm13 & (ymm26 ^ ymm11))
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm3
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm13, %xmm3
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, %ymm13
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm29 ^ (ymm13 & (ymm30 ^ ymm29))
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm9[2],ymm13[3,4,5],ymm9[6],ymm13[7,8,9],ymm9[10],ymm13[11,12,13],ymm9[14],ymm13[15]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm3 & ~mem)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm3
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm31 ^ ymm24))
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,1,3,4,6]
-; AVX512DQ-FCP-NEXT: vpermd %ymm18, %ymm7, %ymm7
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm7[7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm8[6,13,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm8, %xmm22
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, %xmm4
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[1,8,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm3, %zmm21
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm2 & (zmm21 ^ zmm13))
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm2
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm11 ^ (ymm2 & (ymm26 ^ ymm11))
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm3
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm29 ^ (ymm3 & (ymm30 ^ ymm29))
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm9[3],ymm3[4,5],ymm9[6],ymm3[7,8,9,10],ymm9[11],ymm3[12,13],ymm9[14],ymm3[15]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm17 = [18446744073709551615,255,18446744073709486080,18446744073709551615]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm2 & ymm17)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqu64 %ymm24, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm31 ^ (ymm2 & (ymm24 ^ ymm31))
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm7
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,1,3,5,6]
-; AVX512DQ-FCP-NEXT: vpermd %ymm18, %ymm7, %ymm7
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm7[7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,2,4,6]
+; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm22
+; AVX512DQ-FCP-NEXT: vpermd %ymm22, %ymm4, %ymm4
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7]
+; AVX512DQ-FCP-NEXT: vmovdqa 240(%rdi), %xmm4
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm4[5,12,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %xmm5
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm2, %zmm22
-; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm18 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm22 = zmm3 ^ (zmm18 & (zmm22 ^ zmm3))
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm1, %zmm7
+; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm0 & (zmm7 ^ zmm2))
+; AVX512DQ-FCP-NEXT: vmovdqa64 288(%rdi), %ymm16
+; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm1
; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm2
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm11 ^ (ymm2 & (ymm26 ^ ymm11))
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm16 ^ (ymm2 & (ymm1 ^ ymm16))
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,u,u,u,u,3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u]
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm3
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm29 ^ (ymm3 & (ymm30 ^ ymm29))
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0],ymm3[1,2],ymm9[3],ymm3[4,5,6],ymm9[7,8],ymm3[9,10],ymm9[11],ymm3[12,13,14],ymm9[15]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm2 & ymm17)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm2
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm31 ^ (ymm2 & (ymm24 ^ ymm31))
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm7
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm7
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vmovdqa 208(%rdi), %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm8
-; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
-; AVX512DQ-FCP-NEXT: vpor %xmm8, %xmm13, %xmm8
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm10, %xmm10
+; AVX512DQ-FCP-NEXT: vmovdqa 352(%rdi), %ymm14
+; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdi), %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm13
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm14 ^ (ymm13 & (ymm2 ^ ymm14))
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm13[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm13[0,1],ymm8[2],ymm13[3,4,5],ymm8[6],ymm13[7,8,9],ymm8[10],ymm13[11,12,13],ymm8[14],ymm13[15]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm26 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm10 & ymm26)
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
+; AVX512DQ-FCP-NEXT: vmovdqa64 416(%rdi), %ymm17
+; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %ymm18
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm13
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm17 ^ (ymm13 & (ymm18 ^ ymm17))
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm15
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm13[4,11],zero,zero
+; AVX512DQ-FCP-NEXT: vpor %xmm15, %xmm13, %xmm13
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm21 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 ^ (ymm21 & (ymm13 ^ ymm8))
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm3
+; AVX512DQ-FCP-NEXT: vpmovsxwd {{.*#+}} zmm29 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0,0,0,0,0]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm29 & (zmm3 ^ zmm7))
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm7
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm12 ^ (ymm7 & (ymm11 ^ ymm12))
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[6,13],zero,zero,xmm8[2,9,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[1,8,15],zero,zero,xmm7[4,11],zero,zero,xmm7[u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm8
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm31 ^ (ymm8 & (ymm27 ^ ymm31))
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm3
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm3[2],ymm8[3,4,5],ymm3[6],ymm8[7,8,9],ymm3[10],ymm8[11,12,13],ymm3[14],ymm8[15]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm7 & ~mem)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm7
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm28 ^ (ymm7 & (ymm30 ^ ymm28))
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[u,u,u,6,13],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[u,u,u,u]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u],zero,zero,xmm7[4,11],zero,zero,xmm7[0,7,14,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm13, %xmm7
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,0,0,0,1,3,4,6]
+; AVX512DQ-FCP-NEXT: vpermd %ymm22, %ymm13, %ymm13
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm13[7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm4[6,13,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[1,8,15],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm13, %xmm15, %xmm13
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm13, %zmm7, %zmm7
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm0 & (zmm7 ^ zmm8))
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm8
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm1 ^ (ymm8 & (ymm16 ^ ymm1))
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm13
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u],zero,zero,xmm13[2,9],zero,zero,zero,xmm13[5,12,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm13, %xmm8, %xmm8
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm13
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm14 ^ (ymm13 & (ymm2 ^ ymm14))
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm13[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7,8,9,10],ymm15[11],ymm13[12,13],ymm15[14],ymm13[15]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm8 & ymm26)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm8
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm17 ^ (ymm8 & (ymm18 ^ ymm17))
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm8[5,12],zero,zero
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u],zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10]
+; AVX512DQ-FCP-NEXT: vpor %xmm15, %xmm8, %xmm8
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 ^ (mem & (ymm8 ^ ymm7))
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[3,10],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm1[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 ^ (ymm21 & (ymm8 ^ ymm13))
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm24
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm24 = zmm24 ^ (zmm29 & (zmm24 ^ zmm7))
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm7
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm12 ^ (ymm7 & (ymm11 ^ ymm12))
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm8
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm31 ^ (ymm8 & (ymm27 ^ ymm31))
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2],ymm3[3],ymm8[4,5],ymm3[6],ymm8[7,8,9,10],ymm3[11],ymm8[12,13],ymm3[14],ymm8[15]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqu64 %ymm20, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm23 = [18446744073709551615,255,18446744073709486080,18446744073709551615]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm7 & ymm23)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm7
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm30 ^ (ymm7 & (ymm28 ^ ymm30))
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[1,8,15,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,0,7,14],zero,zero,xmm7[3,10],zero,zero,zero,xmm7[u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,1,3,5,6]
+; AVX512DQ-FCP-NEXT: vpermd %ymm22, %ymm8, %ymm3
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5,6],ymm3[7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[2,9],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm3, %zmm3
+; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm22 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm13 ^ (zmm22 & (zmm3 ^ zmm13))
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm7
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm1 ^ (ymm7 & (ymm16 ^ ymm1))
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm13
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u],zero,zero,xmm13[3,10],zero,zero,zero,xmm13[6,13,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,5,12],zero,zero,xmm7[1,8,15],zero,zero,xmm7[u,u]
; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm13, %xmm7
-; AVX512DQ-FCP-NEXT: vmovdqa64 416(%rdi), %ymm24
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm8, %zmm28
-; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %ymm25
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm28 = zmm3 ^ (zmm18 & (zmm28 ^ zmm3))
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm3
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24))
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero
-; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm27 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm27 & (ymm3 ^ ymm5))
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm18
-; AVX512DQ-FCP-NEXT: vpmovsxwd {{.*#+}} zmm5 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0,0,0,0,0]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm18 = zmm18 ^ (zmm5 & (zmm18 ^ zmm20))
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm3
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm19 ^ (ymm3 & (ymm10 ^ ymm19))
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero,xmm3[u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm13
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm2 ^ (ymm13 & (ymm14 ^ ymm2))
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm13[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm15[0],ymm13[1,2],ymm15[3],ymm13[4,5,6],ymm15[7,8],ymm13[9,10],ymm15[11],ymm13[12,13,14],ymm15[15]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm7 & ymm26)
; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm7
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm15 ^ (ymm7 & (ymm16 ^ ymm15))
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7,8,9,10],ymm8[11],ymm7[12,13],ymm8[14],ymm7[15]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm3 & ymm23)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm3
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24))
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm3[5,12],zero,zero
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm17 ^ (ymm7 & (ymm18 ^ ymm17))
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11]
+; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm15, %xmm7
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm21 & (ymm7 ^ ymm13))
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm20
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm29 & (zmm20 ^ zmm3))
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm3
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm12 ^ (ymm3 & (ymm11 ^ ymm12))
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[3,10],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10]
-; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm8, %xmm3
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm27 & (ymm3 ^ ymm7))
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm20
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm5 & (zmm20 ^ zmm21))
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm3
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm19 ^ (ymm3 & (ymm10 ^ ymm19))
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u],zero,zero,xmm7[3,10],zero,zero,zero,xmm7[6,13,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm7
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm16 ^ (ymm7 & (ymm15 ^ ymm16))
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2],ymm8[3],ymm7[4,5,6],ymm8[7,8],ymm7[9,10],ymm8[11],ymm7[12,13,14],ymm8[15]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm7
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm31 ^ (ymm7 & (ymm27 ^ ymm31))
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0],ymm7[1,2],ymm0[3],ymm7[4,5,6],ymm0[7,8],ymm7[9,10],ymm0[11],ymm7[12,13,14],ymm0[15]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm3 & ymm23)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm3
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24))
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11]
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm3
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm30 ^ (ymm3 & (ymm28 ^ ymm30))
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm13
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[2,9,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm13, %xmm13
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vmovdqa 208(%rdi), %xmm0
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm0, %xmm8
+; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %xmm0
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm8, %xmm3
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm8
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm27 & (ymm3 ^ ymm7))
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm21
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm5 & (zmm21 ^ zmm22))
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm3
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm16 ^ (ymm3 & (ymm15 ^ ymm16))
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm3[1,2,3],ymm7[4],ymm3[5,6],ymm7[7,8],ymm3[9,10,11],ymm7[12],ymm3[13,14],ymm7[15]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm3
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm19 ^ (ymm3 & (ymm10 ^ ymm19))
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm23) | ymm2
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm2
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm24 ^ ymm25))
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm7
+; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm25 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm25 & (ymm3 ^ ymm8))
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm4, %zmm3, %zmm3
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm7 ^ (zmm22 & (zmm3 ^ zmm7))
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm4
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm2 ^ (ymm4 & (ymm14 ^ ymm2))
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3],ymm5[4],ymm4[5,6],ymm5[7,8],ymm4[9,10,11],ymm5[12],ymm4[13,14],ymm5[15]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm4, %ymm4
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm5
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm1 ^ (ymm5 & (ymm16 ^ ymm1))
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[u,u,u,u,u,6,13],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[u,u]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u],zero,zero,xmm5[4,11],zero,zero,xmm5[0,7,14,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm5, %xmm5
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = (ymm5 & ymm26) | ymm4
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm4
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm18 ^ (ymm4 & (ymm17 ^ ymm18))
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm7
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,4,11],zero,zero,xmm2[0,7,14],zero,zero
-; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (ymm27 & (ymm2 ^ ymm3))
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm22
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm5 & (zmm22 ^ zmm28))
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm2
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm16 ^ (ymm2 & (ymm15 ^ ymm16))
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6,7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14,15]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm10 ^ (ymm3 & (ymm19 ^ ymm10))
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,4,11],zero,zero,xmm4[0,7,14],zero,zero
+; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm4, %xmm4
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (ymm21 & (ymm4 ^ ymm5))
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm22
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm29 & (zmm22 ^ zmm3))
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm3
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm2 ^ (ymm3 & (ymm14 ^ ymm2))
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6,7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm4
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm16 ^ (ymm4 & (ymm1 ^ ymm16))
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm23) | ymm2
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm2
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm24 ^ ymm25))
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm5
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[6,13]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero
-; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (ymm27 & (ymm1 ^ ymm3))
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm23
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm15 ^ (ymm2 & (ymm16 ^ ymm15))
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7,8],ymm3[9],ymm2[10,11,12],ymm3[13],ymm2[14,15]
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm3
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm10 ^ (ymm3 & (ymm19 ^ ymm10))
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,0,7,14],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = (ymm4 & ymm26) | ymm3
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm3
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm18 ^ (ymm3 & (ymm17 ^ ymm18))
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u],zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[6,13]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero
; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3
+; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm26
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm26 = ymm26 ^ (ymm21 & (ymm26 ^ ymm4))
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm3
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm14 ^ (ymm3 & (ymm2 ^ ymm14))
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15]
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,128,128,128,128,128,128,128,128,128,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ~mem) | ymm2
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm2
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm24 ^ ymm25))
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14]
-; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm28
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm28 = ymm28 ^ (ymm27 & (ymm28 ^ ymm3))
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm2
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm26 ^ (ymm2 & (ymm11 ^ ymm26))
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vporq %xmm3, %xmm2, %xmm27
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm23 = ymm24 ^ (ymm23 & (ymm25 ^ ymm24))
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm2
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm26 ^ (ymm2 & (ymm11 ^ ymm26))
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,zero,xmm2[u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm3
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm4
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm16 ^ (ymm4 & (ymm1 ^ ymm16))
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm7
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,1,8,15],zero,zero,xmm4[4,11],zero,zero,xmm4[u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm4, %xmm7
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = (ymm7 & ~mem) | ymm3
; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm3
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm13
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm26 ^ (ymm6 & (ymm11 ^ ymm26))
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm29 ^ (ymm3 & (ymm30 ^ ymm29))
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[6,13],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[4,11],zero,zero,xmm6[0,7,14,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm8
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm19 = ymm10 ^ (ymm12 & (ymm19 ^ ymm10))
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm7
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm30 ^ (ymm7 & (ymm29 ^ ymm30))
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm30 ^ (ymm12 & (ymm29 ^ ymm30))
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0],ymm3[1,2,3],ymm9[4],ymm3[5,6],ymm9[7,8],ymm3[9,10,11],ymm9[12],ymm3[13,14],ymm9[15]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm27 & ymm17)
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5,6,7,8],ymm9[9],ymm7[10,11],ymm9[12],ymm7[13,14,15]
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm2 & ymm17)
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0],ymm0[1],ymm12[2,3,4],ymm0[5],ymm12[6,7,8],ymm0[9],ymm12[10,11,12],ymm0[13],ymm12[14,15]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm6 & ymm17)
-; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm31 ^ (ymm8 & (ymm0 ^ ymm31))
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm6
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[0,7,14],zero,zero,xmm6[3,10,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm6, %xmm2
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm18 ^ (ymm3 & (ymm17 ^ ymm18))
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14]
+; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3
+; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm29
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm29 = ymm29 ^ (ymm21 & (ymm29 ^ ymm7))
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm13
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm11 ^ (ymm13 & (ymm12 ^ ymm11))
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm8
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm19 = ymm17 ^ (ymm19 & (ymm18 ^ ymm17))
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm3
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm11 ^ (ymm3 & (ymm12 ^ ymm11))
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm7
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm11 ^ (ymm6 & (ymm12 ^ ymm11))
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm11
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[4,11],zero,zero,xmm13[0,7,14],zero,zero,xmm13[u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm11, %xmm12, %xmm13
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm31 ^ (ymm8 & (ymm27 ^ ymm31))
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm11
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm11, %xmm3
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm12
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm16 ^ (ymm9 & (ymm1 ^ ymm16))
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm15
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm27 ^ (ymm15 & (ymm31 ^ ymm27))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm27 ^ (ymm9 & (ymm31 ^ ymm27))
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm6[6,13],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm0
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vporq %xmm11, %xmm0, %xmm16
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm8[1,2,3],ymm4[4],ymm8[5,6],ymm4[7,8],ymm8[9,10,11],ymm4[12],ymm8[13,14],ymm4[15]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm13 & ymm23)
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0],ymm4[1],ymm15[2,3],ymm4[4],ymm15[5,6,7,8],ymm4[9],ymm15[10,11],ymm4[12],ymm15[13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm3 & ymm23)
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm4[1],ymm9[2,3,4],ymm4[5],ymm9[6,7,8],ymm4[9],ymm9[10,11,12],ymm4[13],ymm9[14,15]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm16 & ymm23)
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm14 ^ (ymm10 & (ymm2 ^ ymm14))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm30 ^ (ymm12 & (ymm28 ^ ymm30))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm30 ^ (ymm7 & (ymm28 ^ ymm30))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm28 ^ (ymm10 & (ymm30 ^ ymm28))
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm12[u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm9
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u],zero,zero,xmm9[0,7,14],zero,zero,xmm9[3,10,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm9, %xmm3
; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm6
-; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm12[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
-; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm8, %xmm6
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm8
-; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm11 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 ^ (ymm11 & (ymm8 ^ ymm2))
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm16 = ymm15 ^ (ymm14 & (ymm16 ^ ymm15))
-; AVX512DQ-FCP-NEXT: vmovd {{.*#+}} xmm6 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm31 ^ (ymm13 & (ymm0 ^ ymm31))
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm0 ^ (ymm14 & (ymm31 ^ ymm0))
-; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm2
-; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm15[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm8, %zmm2
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm8 & (zmm2 ^ zmm3))
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,u,3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm10
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,xmm10[1,8,15],zero,zero,xmm10[4,11,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm10, %xmm3
+; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm9
+; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm6[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
+; AVX512DQ-FCP-NEXT: vpor %xmm9, %xmm12, %xmm9
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 ^ (ymm25 & (ymm9 ^ ymm3))
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm7, %xmm3
; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm5
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm10, %xmm5
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm5, %xmm5
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (ymm11 & (ymm5 ^ ymm3))
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm11, %ymm13
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm14, %xmm3
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm7
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm25 & (ymm7 ^ ymm3))
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm3
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm14[u,u,4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm10, %xmm3
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,8,15,22,29,20,27,18,25,128,128,128,128,128,128,128,128,128,128]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm11
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm10[u,u,4,11],zero,zero,xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm5, %xmm3
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,8,15,22,29,20,27,18,25,128,128,128,128,128,128,128,128,128,128]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm10
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm6[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpor %xmm12, %xmm10, %xmm10
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 ^ (ymm13 & (ymm11 ^ ymm3))
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm12[0],xmm3[0],xmm12[1],xmm3[1],xmm12[2],xmm3[2],xmm12[3],xmm3[3]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm3
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm8 & (zmm3 ^ zmm9))
-; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm15, %xmm5
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm11, %zmm5
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm8 & (zmm5 ^ zmm7))
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm10 ^ (ymm25 & (ymm10 ^ ymm3))
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,2,4,6,0,0,0,0]
+; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm12
+; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm3, %ymm3
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm9, %zmm3
+; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm9 & (zmm3 ^ zmm11))
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [1,3,4,6,0,0,0,0]
+; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm11, %ymm11
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm7, %zmm7
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm9 & (zmm7 ^ zmm8))
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,3,5,6,0,0,0,0]
+; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm8, %ymm8
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,5,8,15,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm10, %zmm8
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm9 & (zmm8 ^ zmm0))
; AVX512DQ-FCP-NEXT: movw $-512, %ax # imm = 0xFE00
; AVX512DQ-FCP-NEXT: kmovw %eax, %k1
-; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm2 {%k1}
-; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm28, %zmm0, %zmm3 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm0
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm16[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0,1],ymm6[2],ymm0[3,4],ymm6[5],ymm0[6,7,8,9],ymm6[10],ymm0[11,12],ymm6[13],ymm0[14,15]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,2,9],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[u,u,u]
-; AVX512DQ-FCP-NEXT: vextracti32x4 $1, %ymm19, %xmm1
+; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm26, %zmm0, %zmm3 {%k1}
+; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm29, %zmm0, %zmm7 {%k1}
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm2[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & mem) | ymm4
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm0
-; AVX512DQ-FCP-NEXT: vextracti32x4 $1, %ymm23, %xmm4
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,0,7,14],zero,zero,xmm0[3,10],zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & mem) | ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm2
+; AVX512DQ-FCP-NEXT: vextracti32x4 $1, %ymm19, %xmm0
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm5 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, (%rcx)
+; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm8 {%k1}
+; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%rsi)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, (%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, (%rcx)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, (%r8)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%r9)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%r9)
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%rax)
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, (%rax)
+; AVX512DQ-FCP-NEXT: popq %rax
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
@@ -13743,29 +13720,29 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX512BW-FCP-LABEL: load_i8_stride7_vf64:
; AVX512BW-FCP: # %bb.0:
-; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm26
+; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm0
; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [24,17,2,19,28,21,6,31,16,9,26,27,20,13,30,23]
-; AVX512BW-FCP-NEXT: vpermw %zmm26, %zmm1, %zmm16
+; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm24
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7]
-; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm24
+; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm13
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [24,17,2,27,20,5,22,31,16,9,26,19,12,29,30,23]
-; AVX512BW-FCP-NEXT: vpermw %zmm26, %zmm1, %zmm17
+; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm25
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15]
-; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm25
+; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm12
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,11,4,5,14,7,8,9,26,19,12,29,22,15]
-; AVX512BW-FCP-NEXT: vpermw %zmm26, %zmm1, %zmm18
+; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm16
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31]
-; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm10
+; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm8
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,10,3,4,5,14,7,8,25,18,11,12,29,22,15]
-; AVX512BW-FCP-NEXT: vpermw %zmm26, %zmm1, %zmm4
+; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm4
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31]
; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm5
-; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm12
+; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm10
; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm3
; AVX512BW-FCP-NEXT: movw $-28382, %ax # imm = 0x9122
; AVX512BW-FCP-NEXT: kmovd %eax, %k1
-; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm12, %ymm1 {%k1}
+; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm1 {%k1}
; AVX512BW-FCP-NEXT: kmovq %k1, %k2
; AVX512BW-FCP-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm6
@@ -13776,11 +13753,11 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: movw $992, %ax # imm = 0x3E0
; AVX512BW-FCP-NEXT: kmovd %eax, %k1
; AVX512BW-FCP-NEXT: vmovdqu16 %ymm5, %ymm1 {%k1}
-; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm13
-; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm11
+; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm11
+; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm9
; AVX512BW-FCP-NEXT: movw $8772, %ax # imm = 0x2244
; AVX512BW-FCP-NEXT: kmovd %eax, %k1
-; AVX512BW-FCP-NEXT: vpblendmw %ymm13, %ymm11, %ymm5 {%k1}
+; AVX512BW-FCP-NEXT: vpblendmw %ymm11, %ymm9, %ymm5 {%k1}
; AVX512BW-FCP-NEXT: kmovq %k1, %k3
; AVX512BW-FCP-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6
@@ -13789,285 +13766,285 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5
; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,1,2,4,6]
-; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm19
-; AVX512BW-FCP-NEXT: vpermd %ymm19, %ymm6, %ymm6
+; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm17
+; AVX512BW-FCP-NEXT: vpermd %ymm17, %ymm6, %ymm6
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
-; AVX512BW-FCP-NEXT: vmovdqa 240(%rdi), %xmm6
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm6[5,12,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vmovdqa 224(%rdi), %xmm8
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm9, %xmm7
-; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm5, %zmm5
+; AVX512BW-FCP-NEXT: vmovdqa64 240(%rdi), %xmm19
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm19[5,12,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vmovdqa64 224(%rdi), %xmm20
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm20[0,7,14],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6
+; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm5
; AVX512BW-FCP-NEXT: movabsq $137438429184, %rax # imm = 0x1FFFF80000
; AVX512BW-FCP-NEXT: kmovq %rax, %k5
; AVX512BW-FCP-NEXT: vmovdqu8 %zmm5, %zmm1 {%k5}
-; AVX512BW-FCP-NEXT: vmovdqa 288(%rdi), %ymm7
+; AVX512BW-FCP-NEXT: vmovdqa 288(%rdi), %ymm6
; AVX512BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm5
; AVX512BW-FCP-NEXT: movw $9288, %ax # imm = 0x2448
; AVX512BW-FCP-NEXT: kmovd %eax, %k6
-; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm5, %ymm9 {%k6}
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm9[u,u,u,u,u,3,10],zero,zero,zero,xmm9[6,13],zero,zero,xmm9[u,u]
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm9, %xmm9
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u],zero,zero,xmm9[1,8,15],zero,zero,xmm9[4,11,u,u]
-; AVX512BW-FCP-NEXT: vporq %xmm20, %xmm9, %xmm21
+; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm7 {%k6}
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm7[u,u,u,u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[u,u]
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u],zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11,u,u]
+; AVX512BW-FCP-NEXT: vporq %xmm18, %xmm7, %xmm21
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: movw $3968, %ax # imm = 0xF80
; AVX512BW-FCP-NEXT: kmovd %eax, %k7
; AVX512BW-FCP-NEXT: vmovdqu16 %ymm4, %ymm21 {%k7}
-; AVX512BW-FCP-NEXT: vmovdqa 416(%rdi), %ymm9
+; AVX512BW-FCP-NEXT: vmovdqa 416(%rdi), %ymm7
; AVX512BW-FCP-NEXT: vmovdqa 384(%rdi), %ymm4
; AVX512BW-FCP-NEXT: movw $4644, %ax # imm = 0x1224
; AVX512BW-FCP-NEXT: kmovd %eax, %k4
-; AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm4, %ymm20 {%k4}
-; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm20, %xmm22
+; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm4, %ymm18 {%k4}
+; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm22
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm20[4,11],zero,zero
-; AVX512BW-FCP-NEXT: vporq %xmm22, %xmm20, %xmm20
-; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm22
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm18[4,11],zero,zero
+; AVX512BW-FCP-NEXT: vporq %xmm22, %xmm18, %xmm18
+; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm22
; AVX512BW-FCP-NEXT: movl $-8388608, %eax # imm = 0xFF800000
-; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm12, %ymm20 {%k4}
-; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm20, %xmm23
+; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm18 {%k4}
+; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm23
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[1,8,15],zero,zero,xmm20[4,11],zero,zero,xmm20[u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vporq %xmm23, %xmm20, %xmm20
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[1,8,15],zero,zero,xmm18[4,11],zero,zero,xmm18[u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vporq %xmm23, %xmm18, %xmm18
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: movl $511, %r10d # imm = 0x1FF
; AVX512BW-FCP-NEXT: kmovd %r10d, %k1
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm20, %ymm10 {%k1}
-; AVX512BW-FCP-NEXT: vpblendmw %ymm13, %ymm11, %ymm20 {%k6}
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm20[u,u,u,6,13],zero,zero,xmm20[2,9],zero,zero,zero,xmm20[u,u,u,u]
-; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm20, %xmm20
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u],zero,zero,xmm20[4,11],zero,zero,xmm20[0,7,14,u,u,u,u]
-; AVX512BW-FCP-NEXT: vporq %xmm23, %xmm20, %xmm20
-; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm15
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [0,0,0,0,1,3,4,6]
-; AVX512BW-FCP-NEXT: vpermd %ymm19, %ymm20, %ymm20
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm6[6,13,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm8[1,8,15],zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm20, %xmm15
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm18, %ymm8 {%k1}
+; AVX512BW-FCP-NEXT: vpblendmw %ymm11, %ymm9, %ymm18 {%k6}
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm18[u,u,u,6,13],zero,zero,xmm18[2,9],zero,zero,zero,xmm18[u,u,u,u]
+; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm18
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u],zero,zero,xmm18[4,11],zero,zero,xmm18[0,7,14,u,u,u,u]
+; AVX512BW-FCP-NEXT: vporq %xmm23, %xmm18, %xmm18
+; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm14
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [0,0,0,0,1,3,4,6]
+; AVX512BW-FCP-NEXT: vpermd %ymm17, %ymm18, %ymm18
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm15[7]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm19[6,13,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm20[1,8,15],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm18, %xmm15
; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm14, %zmm14
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm10 {%k5}
-; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm12, %ymm14 {%k3}
+; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm8 {%k5}
+; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm14 {%k3}
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[2,9],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[0,7,14],zero,zero,xmm14[3,10,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm15
+; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14
; AVX512BW-FCP-NEXT: movl $261632, %r10d # imm = 0x3FE00
; AVX512BW-FCP-NEXT: kmovd %r10d, %k5
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm25[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpblendmw %ymm11, %ymm13, %ymm14 {%k2}
-; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm14, %xmm20
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,zero,xmm20[5,12],zero,zero,xmm20[1,8,15,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u]
-; AVX512BW-FCP-NEXT: vporq %xmm20, %xmm14, %xmm14
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [0,0,0,0,1,3,5,6]
-; AVX512BW-FCP-NEXT: vpermd %ymm19, %ymm20, %ymm19
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm19[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6],ymm0[7]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[2,9],zero,zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,xmm6[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vporq %xmm14, %xmm19, %xmm14
-; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm14, %zmm0, %zmm14
-; AVX512BW-FCP-NEXT: vmovdqu16 %zmm15, %zmm14 {%k1}
-; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm12, %ymm0 {%k6}
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpor %xmm0, %xmm15, %xmm0
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm24[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpblendmw %ymm11, %ymm13, %ymm15 {%k4}
-; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm19
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u],zero,zero,zero,xmm19[6,13],zero,zero,xmm19[2,9,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,1,8,15],zero,zero,xmm15[4,11],zero,zero,xmm15[u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vporq %xmm19, %xmm15, %xmm15
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm12[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm12 {%k2}
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm15
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[5,12],zero,zero,xmm15[1,8,15,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,0,7,14],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[u,u,u,u]
+; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm12, %xmm12
+; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,0,0,0,1,3,5,6]
+; AVX512BW-FCP-NEXT: vpermd %ymm17, %ymm15, %ymm15
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm15[7]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm20[2,9],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,xmm19[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm17, %xmm15
+; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm12, %zmm12
+; AVX512BW-FCP-NEXT: vmovdqu16 %zmm14, %zmm12 {%k1}
+; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm14 {%k6}
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm13[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm13 {%k4}
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm15
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm13, %xmm13
+; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512BW-FCP-NEXT: vmovdqa64 208(%rdi), %xmm17
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm17[5,12]
+; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %xmm18
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm18[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
+; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm23, %xmm15
; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512BW-FCP-NEXT: vmovdqa64 208(%rdi), %xmm19
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm19[5,12]
-; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %xmm20
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm24 = xmm20[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
-; AVX512BW-FCP-NEXT: vporq %xmm23, %xmm24, %xmm23
-; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm23, %ymm0, %ymm23
-; AVX512BW-FCP-NEXT: movl $-134217728, %edi # imm = 0xF8000000
-; AVX512BW-FCP-NEXT: kmovd %edi, %k2
+; AVX512BW-FCP-NEXT: movl $-134217728, %r10d # imm = 0xF8000000
+; AVX512BW-FCP-NEXT: kmovd %r10d, %k2
; AVX512BW-FCP-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm23, %ymm15 {%k2}
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm8[3,10],zero,zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm24 = zero,zero,xmm6[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vporq %xmm23, %xmm24, %xmm23
-; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm23, %zmm15, %zmm15
-; AVX512BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm15 {%k1}
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm13 {%k2}
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm20[3,10],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,xmm19[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm19, %xmm15
+; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm13, %zmm13
+; AVX512BW-FCP-NEXT: vmovdqu16 %zmm14, %zmm13 {%k1}
; AVX512BW-FCP-NEXT: kmovd %eax, %k3
; AVX512BW-FCP-NEXT: vmovdqu8 %ymm22, %ymm21 {%k3}
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm14
; AVX512BW-FCP-NEXT: movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000
; AVX512BW-FCP-NEXT: kmovq %rax, %k2
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm1 {%k2}
-; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm7, %ymm0 {%k4}
-; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm0, %xmm21
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[u,u,u,u,u],zero,zero,xmm21[2,9],zero,zero,zero,xmm21[5,12,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u]
-; AVX512BW-FCP-NEXT: vporq %xmm21, %xmm0, %xmm0
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm18 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vmovdqu16 %ymm18, %ymm0 {%k7}
+; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm1 {%k2}
+; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm14 {%k4}
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u],zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u]
+; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vmovdqu16 %ymm15, %ymm14 {%k7}
; AVX512BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm4, %ymm18 {%k1}
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm18[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm18[5,12],zero,zero
-; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm18
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u],zero,zero,xmm18[0,7,14],zero,zero,xmm18[3,10]
-; AVX512BW-FCP-NEXT: vporq %xmm21, %xmm18, %xmm18
-; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm18, %ymm0 {%k3}
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm10 {%k2}
-; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm7, %ymm0 {%k1}
-; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm0, %xmm18
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u],zero,zero,xmm18[3,10],zero,zero,zero,xmm18[6,13,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u]
-; AVX512BW-FCP-NEXT: vporq %xmm18, %xmm0, %xmm0
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vmovdqu16 %ymm17, %ymm0 {%k7}
-; AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm4, %ymm17 {%k6}
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm17[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm17[6,13],zero,zero
-; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm17, %xmm17
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u],zero,zero,xmm17[1,8,15],zero,zero,xmm17[4,11]
-; AVX512BW-FCP-NEXT: vporq %xmm18, %xmm17, %xmm17
-; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm0 {%k3}
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm14 {%k2}
-; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm7, %ymm0 {%k6}
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm0[u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u]
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u]
-; AVX512BW-FCP-NEXT: vporq %xmm17, %xmm0, %xmm0
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vmovdqu16 %ymm16, %ymm0 {%k7}
-; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm9, %ymm16 {%k4}
-; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm17
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u],zero,zero,xmm17[2,9],zero,zero,zero,xmm17[5,12]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u,4,11],zero,zero,xmm16[0,7,14],zero,zero
-; AVX512BW-FCP-NEXT: vporq %xmm17, %xmm16, %xmm16
-; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm0 {%k3}
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm15 {%k2}
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,9,2,3,4,13,6,7,24,17,10,11,28,21,14,31]
-; AVX512BW-FCP-NEXT: vpermw %zmm26, %zmm0, %zmm17
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31]
-; AVX512BW-FCP-NEXT: vpermw %zmm26, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm4, %ymm15 {%k1}
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm15[5,12],zero,zero
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,xmm15[0,7,14],zero,zero,xmm15[3,10]
+; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15
+; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k3}
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14
+; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm8 {%k2}
+; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm14 {%k1}
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u],zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u]
+; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm25[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vmovdqu16 %ymm15, %ymm14 {%k7}
+; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm4, %ymm15 {%k6}
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm15[6,13],zero,zero
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,xmm15[1,8,15],zero,zero,xmm15[4,11]
+; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15
+; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k3}
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14
+; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm12 {%k2}
+; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm14 {%k6}
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,6,13],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[u,u]
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u],zero,zero,xmm14[4,11],zero,zero,xmm14[0,7,14,u,u]
+; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm24[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vmovdqu16 %ymm15, %ymm14 {%k7}
+; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm7, %ymm15 {%k4}
+; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm16
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u],zero,zero,xmm16[2,9],zero,zero,zero,xmm16[5,12]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u,4,11],zero,zero,xmm15[0,7,14],zero,zero
+; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15
+; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k3}
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14
+; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm13 {%k2}
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,9,2,3,4,13,6,7,24,17,10,11,28,21,14,31]
+; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm14, %zmm15
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31]
+; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm14, %zmm16
; AVX512BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm5, %ymm16 {%k2}
-; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm18
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u],zero,zero,zero,xmm18[5,12],zero,zero,xmm18[1,8,15,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,0,7,14],zero,zero,xmm16[3,10],zero,zero,zero,xmm16[u,u]
-; AVX512BW-FCP-NEXT: vporq %xmm18, %xmm16, %xmm16
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vmovdqu16 %ymm0, %ymm16 {%k7}
-; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm5, %ymm0 {%k4}
-; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm0, %xmm18
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u],zero,zero,zero,xmm18[6,13],zero,zero,xmm18[2,9,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u]
-; AVX512BW-FCP-NEXT: vporq %xmm18, %xmm0, %xmm0
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm14 {%k2}
+; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm14, %xmm19
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u],zero,zero,zero,xmm19[5,12],zero,zero,xmm19[1,8,15,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u]
+; AVX512BW-FCP-NEXT: vporq %xmm19, %xmm14, %xmm14
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vmovdqu16 %ymm16, %ymm14 {%k7}
+; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm16 {%k4}
+; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm19
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u],zero,zero,zero,xmm19[6,13],zero,zero,xmm19[2,9,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,1,8,15],zero,zero,xmm16[4,11],zero,zero,xmm16[u,u,u]
+; AVX512BW-FCP-NEXT: vporq %xmm19, %xmm16, %xmm16
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: movl $8176, %eax # imm = 0x1FF0
; AVX512BW-FCP-NEXT: kmovd %eax, %k1
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm0, %ymm17 {%k1}
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k1}
; AVX512BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm9, %ymm0 {%k1}
-; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm0, %xmm18
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u],zero,zero,xmm18[3,10],zero,zero,zero,xmm18[6,13]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero
-; AVX512BW-FCP-NEXT: vporq %xmm18, %xmm0, %xmm0
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm0, %ymm16 {%k3}
-; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm9, %ymm0 {%k6}
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14]
-; AVX512BW-FCP-NEXT: vporq %xmm18, %xmm0, %xmm0
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm0, %ymm17 {%k3}
-; AVX512BW-FCP-NEXT: vpblendmw %ymm12, %ymm3, %ymm0 {%k4}
-; AVX512BW-FCP-NEXT: vpblendmw %ymm12, %ymm3, %ymm18 {%k1}
-; AVX512BW-FCP-NEXT: vpblendmw %ymm11, %ymm13, %ymm21 {%k6}
-; AVX512BW-FCP-NEXT: vmovdqu16 %ymm12, %ymm3 {%k6}
-; AVX512BW-FCP-NEXT: vpblendmw %ymm11, %ymm13, %ymm12 {%k1}
-; AVX512BW-FCP-NEXT: vmovdqu16 %ymm13, %ymm11 {%k4}
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,xmm12[0,7,14],zero,zero,xmm12[3,10,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpor %xmm13, %xmm12, %xmm12
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm19[6,13]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm20[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
-; AVX512BW-FCP-NEXT: vporq %xmm13, %xmm22, %xmm13
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm7, %ymm16 {%k1}
+; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm19
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,u,u],zero,zero,xmm19[3,10],zero,zero,zero,xmm19[6,13]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u,5,12],zero,zero,xmm16[1,8,15],zero,zero
+; AVX512BW-FCP-NEXT: vporq %xmm19, %xmm16, %xmm16
+; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm14 {%k3}
+; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm7, %ymm16 {%k6}
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm16[u,u,u,u,u,u,u,6,13],zero,zero,xmm16[2,9],zero,zero,zero
+; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm16
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u],zero,zero,xmm16[4,11],zero,zero,xmm16[0,7,14]
+; AVX512BW-FCP-NEXT: vporq %xmm19, %xmm16, %xmm16
+; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k3}
+; AVX512BW-FCP-NEXT: vpblendmw %ymm10, %ymm3, %ymm19 {%k4}
+; AVX512BW-FCP-NEXT: vpblendmw %ymm10, %ymm3, %ymm16 {%k1}
+; AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm20 {%k6}
+; AVX512BW-FCP-NEXT: vmovdqu16 %ymm10, %ymm3 {%k6}
+; AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm10 {%k1}
+; AVX512BW-FCP-NEXT: vmovdqu16 %ymm11, %ymm9 {%k4}
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,u,2,9],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm10, %xmm10
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,xmm10[0,7,14],zero,zero,xmm10[3,10,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10
+; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm17[6,13]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm18[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
+; AVX512BW-FCP-NEXT: vporq %xmm11, %xmm21, %xmm11
+; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
; AVX512BW-FCP-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm13, %ymm12 {%k3}
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm21[u,u,3,10],zero,zero,zero,xmm21[6,13],zero,zero,xmm21[u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm21, %xmm21
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[u,u],zero,zero,xmm21[1,8,15],zero,zero,xmm21[4,11,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vporq %xmm13, %xmm21, %xmm13
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm20[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm19[0,7,14]
-; AVX512BW-FCP-NEXT: vporq %xmm21, %xmm22, %xmm21
-; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm21, %ymm13 {%k3}
-; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm11, %xmm21
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[u,u],zero,zero,xmm21[2,9],zero,zero,zero,xmm21[5,12,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,11],zero,zero,xmm11[0,7,14],zero,zero,xmm11[u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vporq %xmm21, %xmm11, %xmm11
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm10 {%k3}
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm20[u,u,3,10],zero,zero,zero,xmm20[6,13],zero,zero,xmm20[u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm20, %xmm20
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,xmm20[1,8,15],zero,zero,xmm20[4,11,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vporq %xmm11, %xmm20, %xmm11
; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm19[1,8,15]
-; AVX512BW-FCP-NEXT: vporq %xmm20, %xmm19, %xmm19
-; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm19
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm19, %ymm11 {%k3}
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm19 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31]
-; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm19, %zmm19
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm20 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm18[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm17[0,7,14]
+; AVX512BW-FCP-NEXT: vporq %xmm20, %xmm21, %xmm20
+; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm20, %ymm11 {%k3}
+; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm9, %xmm20
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,xmm20[2,9],zero,zero,zero,xmm20[5,12,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,4,11],zero,zero,xmm9[0,7,14],zero,zero,xmm9[u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vporq %xmm20, %xmm9, %xmm9
+; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm17[1,8,15]
+; AVX512BW-FCP-NEXT: vporq %xmm18, %xmm17, %xmm17
+; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm9 {%k3}
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm17 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31]
+; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm17, %zmm17
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm18 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31]
+; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm18, %zmm18
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm20 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7]
; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm20, %zmm20
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm21 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7]
-; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm21, %zmm21
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm19, %xmm2
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm0, %xmm2
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm21[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
-; AVX512BW-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm21
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm6[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm21 = xmm21[0],xmm22[0],xmm21[1],xmm22[1],xmm21[2],xmm22[2],xmm21[3],xmm22[3]
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm21, %zmm12, %zmm12
-; AVX512BW-FCP-NEXT: vmovdqu16 %zmm12, %zmm2 {%k5}
-; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm12
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[5,12],zero,zero,xmm18[1,8,15],zero,zero,xmm18[u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vporq %xmm12, %xmm18, %xmm12
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm12 {%k5} = ymm20[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm6[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm8[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm18 = xmm20[0],xmm18[0],xmm20[1],xmm18[1],xmm20[2],xmm18[2],xmm20[3],xmm18[3]
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm18, %zmm13, %zmm13
-; AVX512BW-FCP-NEXT: vmovdqu16 %zmm13, %zmm12 {%k5}
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm3[6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[4,11],zero,zero,xmm19[0,7,14],zero,zero,xmm19[u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vporq %xmm2, %xmm19, %xmm2
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm20[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [1,2,4,6,0,0,0,0]
+; AVX512BW-FCP-NEXT: vmovdqa64 224(%rdi), %ymm20
+; AVX512BW-FCP-NEXT: vpermd %ymm20, %ymm19, %ymm19
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm19, %zmm10, %zmm10
+; AVX512BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm2 {%k5}
+; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm10
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[3,10],zero,zero,zero,xmm10[6,13,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[5,12],zero,zero,xmm16[1,8,15],zero,zero,xmm16[u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vporq %xmm10, %xmm16, %xmm10
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [1,3,4,6,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermd %ymm20, %ymm16, %ymm16
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm16, %zmm11, %zmm11
+; AVX512BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm10 {%k5}
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm13, %xmm3
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 {%k5} = ymm19[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb %xmm0, %xmm6, %xmm0
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm11, %zmm0
-; AVX512BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm3 {%k5}
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15]
-; AVX512BW-FCP-NEXT: vpermw %zmm26, %zmm0, %zmm0
-; AVX512BW-FCP-NEXT: vmovdqu16 %ymm7, %ymm5 {%k1}
+; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm11, %xmm3
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 {%k5} = ymm17[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [1,3,5,6,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermd %ymm20, %ymm11, %ymm11
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[2,5,8,15,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm9, %zmm9
+; AVX512BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm3 {%k5}
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15]
+; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm9, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqu16 %ymm6, %ymm5 {%k1}
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,u,2,9],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[u,u,u]
; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u],zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,10,u,u,u]
@@ -14075,13 +14052,13 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: movl $4186112, %eax # imm = 0x3FE000
; AVX512BW-FCP-NEXT: kmovd %eax, %k1
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 {%k1} = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm0
; AVX512BW-FCP-NEXT: movw $-512, %ax # imm = 0xFE00
-; AVX512BW-FCP-NEXT: vmovdqu16 %ymm9, %ymm4 {%k2}
+; AVX512BW-FCP-NEXT: vmovdqu16 %ymm7, %ymm4 {%k2}
; AVX512BW-FCP-NEXT: kmovd %eax, %k1
; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1}
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm0
-; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1}
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm10 {%k1}
; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm0
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15]
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,0,7,14],zero,zero,xmm4[3,10],zero,zero,zero
@@ -14094,11 +14071,11 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rdi
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, (%rsi)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%rdx)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, (%rcx)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, (%r8)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%rdx)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, (%rcx)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, (%r8)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%r9)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, (%rdi)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%rdi)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
@@ -14472,29 +14449,29 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX512DQ-BW-FCP-LABEL: load_i8_stride7_vf64:
; AVX512DQ-BW-FCP: # %bb.0:
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm26
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [24,17,2,19,28,21,6,31,16,9,26,27,20,13,30,23]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm26, %zmm1, %zmm16
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm24
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm24
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm13
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [24,17,2,27,20,5,22,31,16,9,26,19,12,29,30,23]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm26, %zmm1, %zmm17
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm25
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm25
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm12
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,11,4,5,14,7,8,9,26,19,12,29,22,15]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm26, %zmm1, %zmm18
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm16
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm10
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm8
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,10,3,4,5,14,7,8,25,18,11,12,29,22,15]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm26, %zmm1, %zmm5
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm5
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31]
; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm4
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm12
+; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm10
; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm3
; AVX512DQ-BW-FCP-NEXT: movw $-28382, %ax # imm = 0x9122
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm12, %ymm1 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm1 {%k1}
; AVX512DQ-BW-FCP-NEXT: kmovq %k1, %k2
; AVX512DQ-BW-FCP-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm6
@@ -14505,11 +14482,11 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-FCP-NEXT: movw $992, %ax # imm = 0x3E0
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm4, %ymm1 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm13
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm11
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm11
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm9
; AVX512DQ-BW-FCP-NEXT: movw $8772, %ax # imm = 0x2244
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm13, %ymm11, %ymm4 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm11, %ymm9, %ymm4 {%k1}
; AVX512DQ-BW-FCP-NEXT: kmovq %k1, %k3
; AVX512DQ-BW-FCP-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm6
@@ -14518,315 +14495,315 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm4, %xmm4
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,1,2,4,6]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm19
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm19, %ymm6, %ymm6
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm17
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm17, %ymm6, %ymm6
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 240(%rdi), %xmm6
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm6[5,12,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 224(%rdi), %xmm8
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm9, %xmm7
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm4, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 240(%rdi), %xmm19
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm19[5,12,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 224(%rdi), %xmm20
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm20[0,7,14],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm4, %zmm4
; AVX512DQ-BW-FCP-NEXT: movabsq $137438429184, %rax # imm = 0x1FFFF80000
; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k5
; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm1 {%k5}
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 288(%rdi), %ymm7
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 288(%rdi), %ymm6
; AVX512DQ-BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm4
; AVX512DQ-BW-FCP-NEXT: movw $9288, %ax # imm = 0x2448
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k6
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm4, %ymm9 {%k6}
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm9[u,u,u,u,u,3,10],zero,zero,zero,xmm9[6,13],zero,zero,xmm9[u,u]
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm9, %xmm9
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u],zero,zero,xmm9[1,8,15],zero,zero,xmm9[4,11,u,u]
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm20, %xmm9, %xmm21
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm4, %ymm7 {%k6}
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm7[u,u,u,u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[u,u]
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u],zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11,u,u]
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm18, %xmm7, %xmm21
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: movw $3968, %ax # imm = 0xF80
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k7
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm5, %ymm21 {%k7}
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 416(%rdi), %ymm9
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 416(%rdi), %ymm7
; AVX512DQ-BW-FCP-NEXT: vmovdqa 384(%rdi), %ymm5
; AVX512DQ-BW-FCP-NEXT: movw $4644, %ax # imm = 0x1224
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k4
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm5, %ymm20 {%k4}
-; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm20, %xmm22
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm5, %ymm18 {%k4}
+; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm22
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm20[4,11],zero,zero
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm22, %xmm20, %xmm20
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm22
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm18[4,11],zero,zero
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm22, %xmm18, %xmm18
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm22
; AVX512DQ-BW-FCP-NEXT: movl $-8388608, %eax # imm = 0xFF800000
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm12, %ymm20 {%k4}
-; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm20, %xmm23
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm18 {%k4}
+; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm23
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[1,8,15],zero,zero,xmm20[4,11],zero,zero,xmm20[u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm23, %xmm20, %xmm20
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[1,8,15],zero,zero,xmm18[4,11],zero,zero,xmm18[u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm23, %xmm18, %xmm18
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: movl $511, %r10d # imm = 0x1FF
; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm20, %ymm10 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm13, %ymm11, %ymm20 {%k6}
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm20[u,u,u,6,13],zero,zero,xmm20[2,9],zero,zero,zero,xmm20[u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm20, %xmm20
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u],zero,zero,xmm20[4,11],zero,zero,xmm20[0,7,14,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm23, %xmm20, %xmm20
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm15
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [0,0,0,0,1,3,4,6]
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm19, %ymm20, %ymm20
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm6[6,13,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm8[1,8,15],zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm20, %xmm15
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm18, %ymm8 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm11, %ymm9, %ymm18 {%k6}
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm18[u,u,u,6,13],zero,zero,xmm18[2,9],zero,zero,zero,xmm18[u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm18
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u],zero,zero,xmm18[4,11],zero,zero,xmm18[0,7,14,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm23, %xmm18, %xmm18
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm14
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [0,0,0,0,1,3,4,6]
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm17, %ymm18, %ymm18
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm15[7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm19[6,13,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm20[1,8,15],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm18, %xmm15
; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm14, %zmm14
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm10 {%k5}
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm12, %ymm14 {%k3}
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm8 {%k5}
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm14 {%k3}
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[2,9],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[0,7,14],zero,zero,xmm14[3,10,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm15
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14
; AVX512DQ-BW-FCP-NEXT: movl $261632, %r10d # imm = 0x3FE00
; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k5
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm25[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm11, %ymm13, %ymm14 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm14, %xmm20
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,zero,xmm20[5,12],zero,zero,xmm20[1,8,15,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm20, %xmm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [0,0,0,0,1,3,5,6]
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm19, %ymm20, %ymm19
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm19[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[2,9],zero,zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,xmm6[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm14, %xmm19, %xmm14
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm14, %zmm0, %zmm14
-; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm15, %zmm14 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm12, %ymm0 {%k6}
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm0, %xmm15, %xmm0
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm24[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm11, %ymm13, %ymm15 {%k4}
-; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm19
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u],zero,zero,zero,xmm19[6,13],zero,zero,xmm19[2,9,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,1,8,15],zero,zero,xmm15[4,11],zero,zero,xmm15[u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm19, %xmm15, %xmm15
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm12[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm12 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm15
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[5,12],zero,zero,xmm15[1,8,15,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,0,7,14],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm12, %xmm12
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,0,0,0,1,3,5,6]
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm17, %ymm15, %ymm15
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm15[7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm20[2,9],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,xmm19[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm17, %xmm15
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm12, %zmm12
+; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm14, %zmm12 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm14 {%k6}
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm13[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm13 {%k4}
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm15
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm13, %xmm13
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 208(%rdi), %xmm17
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm17[5,12]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %xmm18
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm18[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm23, %xmm15
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 208(%rdi), %xmm19
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm19[5,12]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %xmm20
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm24 = xmm20[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm23, %xmm24, %xmm23
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm23, %ymm0, %ymm23
-; AVX512DQ-BW-FCP-NEXT: movl $-134217728, %edi # imm = 0xF8000000
-; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2
+; AVX512DQ-BW-FCP-NEXT: movl $-134217728, %r10d # imm = 0xF8000000
+; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k2
; AVX512DQ-BW-FCP-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm23, %ymm15 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm8[3,10],zero,zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm24 = zero,zero,xmm6[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm23, %xmm24, %xmm23
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm23, %zmm15, %zmm15
-; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm15 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm13 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm20[3,10],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,xmm19[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm19, %xmm15
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm13, %zmm13
+; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm14, %zmm13 {%k1}
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3
; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm22, %ymm21 {%k3}
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm14
; AVX512DQ-BW-FCP-NEXT: movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000
; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k2
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm1 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm7, %ymm0 {%k4}
-; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm0, %xmm21
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[u,u,u,u,u],zero,zero,xmm21[2,9],zero,zero,zero,xmm21[5,12,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u]
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm21, %xmm0, %xmm0
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm18 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm18, %ymm0 {%k7}
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm1 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm6, %ymm14 {%k4}
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u],zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u]
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm15, %ymm14 {%k7}
; AVX512DQ-BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm5, %ymm18 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm18[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm18[5,12],zero,zero
-; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm18
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u],zero,zero,xmm18[0,7,14],zero,zero,xmm18[3,10]
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm21, %xmm18, %xmm18
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm18, %ymm0 {%k3}
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm10 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm7, %ymm0 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm0, %xmm18
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u],zero,zero,xmm18[3,10],zero,zero,zero,xmm18[6,13,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u]
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm18, %xmm0, %xmm0
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm17, %ymm0 {%k7}
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm5, %ymm17 {%k6}
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm17[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm17[6,13],zero,zero
-; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm17, %xmm17
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u],zero,zero,xmm17[1,8,15],zero,zero,xmm17[4,11]
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm18, %xmm17, %xmm17
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm0 {%k3}
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm14 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm7, %ymm0 {%k6}
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm0[u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u]
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u]
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm17, %xmm0, %xmm0
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm16, %ymm0 {%k7}
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm9, %ymm16 {%k4}
-; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm17
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u],zero,zero,xmm17[2,9],zero,zero,zero,xmm17[5,12]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u,4,11],zero,zero,xmm16[0,7,14],zero,zero
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm17, %xmm16, %xmm16
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm0 {%k3}
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm15 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,9,2,3,4,13,6,7,24,17,10,11,28,21,14,31]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm26, %zmm0, %zmm17
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm26, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm5, %ymm15 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm15[5,12],zero,zero
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,xmm15[0,7,14],zero,zero,xmm15[3,10]
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k3}
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm8 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm6, %ymm14 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u],zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u]
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm25[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm15, %ymm14 {%k7}
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm5, %ymm15 {%k6}
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm15[6,13],zero,zero
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,xmm15[1,8,15],zero,zero,xmm15[4,11]
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k3}
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm12 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm6, %ymm14 {%k6}
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,6,13],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[u,u]
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u],zero,zero,xmm14[4,11],zero,zero,xmm14[0,7,14,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm24[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm15, %ymm14 {%k7}
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm7, %ymm15 {%k4}
+; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm16
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u],zero,zero,xmm16[2,9],zero,zero,zero,xmm16[5,12]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u,4,11],zero,zero,xmm15[0,7,14],zero,zero
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k3}
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm13 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,9,2,3,4,13,6,7,24,17,10,11,28,21,14,31]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm14, %zmm15
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm14, %zmm16
; AVX512DQ-BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm4, %ymm16 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm18
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u],zero,zero,zero,xmm18[5,12],zero,zero,xmm18[1,8,15,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,0,7,14],zero,zero,xmm16[3,10],zero,zero,zero,xmm16[u,u]
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm18, %xmm16, %xmm16
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm0, %ymm16 {%k7}
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm4, %ymm0 {%k4}
-; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm0, %xmm18
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u],zero,zero,zero,xmm18[6,13],zero,zero,xmm18[2,9,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm18, %xmm0, %xmm0
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm4, %ymm14 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm14, %xmm19
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u],zero,zero,zero,xmm19[5,12],zero,zero,xmm19[1,8,15,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u]
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm19, %xmm14, %xmm14
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm16, %ymm14 {%k7}
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm4, %ymm16 {%k4}
+; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm19
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u],zero,zero,zero,xmm19[6,13],zero,zero,xmm19[2,9,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,1,8,15],zero,zero,xmm16[4,11],zero,zero,xmm16[u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm19, %xmm16, %xmm16
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: movl $8176, %eax # imm = 0x1FF0
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm0, %ymm17 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k1}
; AVX512DQ-BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm9, %ymm0 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm0, %xmm18
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u],zero,zero,xmm18[3,10],zero,zero,zero,xmm18[6,13]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm18, %xmm0, %xmm0
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm0, %ymm16 {%k3}
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm9, %ymm0 {%k6}
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14]
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm18, %xmm0, %xmm0
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm0, %ymm17 {%k3}
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm12, %ymm3, %ymm21 {%k4}
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm12, %ymm3, %ymm18 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm11, %ymm13, %ymm0 {%k6}
-; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm12, %ymm3 {%k6}
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm11, %ymm13, %ymm12 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm13, %ymm11 {%k4}
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,xmm12[0,7,14],zero,zero,xmm12[3,10,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm13, %xmm12, %xmm12
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm19[6,13]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm20[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm13, %xmm22, %xmm13
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm7, %ymm16 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm19
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,u,u],zero,zero,xmm19[3,10],zero,zero,zero,xmm19[6,13]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u,5,12],zero,zero,xmm16[1,8,15],zero,zero
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm19, %xmm16, %xmm16
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm14 {%k3}
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm7, %ymm16 {%k6}
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm16[u,u,u,u,u,u,u,6,13],zero,zero,xmm16[2,9],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm16
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u],zero,zero,xmm16[4,11],zero,zero,xmm16[0,7,14]
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm19, %xmm16, %xmm16
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k3}
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm10, %ymm3, %ymm19 {%k4}
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm10, %ymm3, %ymm16 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm20 {%k6}
+; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm10, %ymm3 {%k6}
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm10 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm11, %ymm9 {%k4}
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,u,2,9],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm10, %xmm10
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,xmm10[0,7,14],zero,zero,xmm10[3,10,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm17[6,13]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm18[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm11, %xmm21, %xmm11
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
; AVX512DQ-BW-FCP-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm13, %ymm12 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u],zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm0, %xmm13, %xmm0
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm20[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm19[0,7,14]
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm13, %xmm22, %xmm13
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm13, %ymm0 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm13
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[2,9],zero,zero,zero,xmm13[5,12,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,11],zero,zero,xmm11[0,7,14],zero,zero,xmm11[u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm13, %xmm11, %xmm11
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm10 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm20[u,u,3,10],zero,zero,zero,xmm20[6,13],zero,zero,xmm20[u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm20, %xmm20
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,xmm20[1,8,15],zero,zero,xmm20[4,11,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm11, %xmm20, %xmm11
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm20[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm19[1,8,15]
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm13, %xmm19, %xmm13
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm13, %ymm11 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm13, %zmm13
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm19 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm19, %zmm19
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm18[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm17[0,7,14]
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm20, %xmm21, %xmm20
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm20, %ymm11 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm9, %xmm20
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,xmm20[2,9],zero,zero,zero,xmm20[5,12,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,4,11],zero,zero,xmm9[0,7,14],zero,zero,xmm9[u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm20, %xmm9, %xmm9
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm17[1,8,15]
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm18, %xmm17, %xmm17
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm9 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm17 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm17, %zmm17
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm18 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm18, %zmm18
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm20 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7]
; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm20, %zmm20
-; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm21, %xmm2
+; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm19, %xmm2
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[4,11],zero,zero,xmm21[0,7,14],zero,zero,xmm21[u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm2, %xmm21, %xmm2
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[4,11],zero,zero,xmm19[0,7,14],zero,zero,xmm19[u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm2, %xmm19, %xmm2
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm20[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm20 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm20, %xmm8, %xmm21
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm6[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm21 = xmm21[0],xmm22[0],xmm21[1],xmm22[1],xmm21[2],xmm22[2],xmm21[3],xmm22[3]
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm21, %zmm12, %zmm12
-; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm12, %zmm2 {%k5}
-; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm12
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[5,12],zero,zero,xmm18[1,8,15],zero,zero,xmm18[u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm12, %xmm18, %xmm12
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm12 {%k5} = ymm19[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm6[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm8[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm18 = xmm19[0],xmm18[0],xmm19[1],xmm18[1],xmm19[2],xmm18[2],xmm19[3],xmm18[3]
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm18, %zmm0, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm12 {%k5}
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm3[6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [1,2,4,6,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 224(%rdi), %ymm20
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm20, %ymm19, %ymm19
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm19, %zmm10, %zmm10
+; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm2 {%k5}
+; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm10
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[3,10],zero,zero,zero,xmm10[6,13,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[5,12],zero,zero,xmm16[1,8,15],zero,zero,xmm16[u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm10, %xmm16, %xmm10
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [1,3,4,6,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm20, %ymm16, %ymm16
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm16, %zmm11, %zmm11
+; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm10 {%k5}
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm13[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm20, %xmm6, %xmm3
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm11, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm0 {%k5}
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm26, %zmm3, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm7, %ymm4 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm11, %xmm3
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 {%k5} = ymm17[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [1,3,5,6,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm20, %ymm11, %ymm11
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[2,5,8,15,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm9, %zmm9
+; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm3 {%k5}
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm9, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm6, %ymm4 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm4[u,u,u,u,2,9],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[u,u,u]
; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm4, %xmm4
; AVX512DQ-BW-FCP-NEXT: movl $4186112, %eax # imm = 0x3FE000
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 {%k1} = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 {%k1} = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: movw $-512, %ax # imm = 0xFE00
; AVX512DQ-BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm9, %ymm5 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm7, %ymm5 {%k1}
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1
-; AVX512DQ-BW-FCP-NEXT: vinserti32x8 $1, %ymm16, %zmm0, %zmm2 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vinserti32x8 $1, %ymm17, %zmm0, %zmm12 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm3
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15]
+; AVX512DQ-BW-FCP-NEXT: vinserti32x8 $1, %ymm14, %zmm0, %zmm2 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vinserti32x8 $1, %ymm15, %zmm0, %zmm10 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm0
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,0,7,14],zero,zero,xmm5[3,10],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm5, %xmm3
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vinserti32x8 $1, %ymm3, %zmm0, %zmm0 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm0, %xmm5, %xmm0
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm3 {%k1}
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rdi
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, (%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, (%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, (%r8)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, (%r8)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%r9)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, (%rdi)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%rdi)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%wide.vec = load <448 x i8>, ptr %in.vec, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll
index 5b607748c57615..99932c0026b23a 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll
@@ -1364,90 +1364,55 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm1
+; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm2
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7]
+; AVX2-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm3
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm5
+; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm6
+; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm7 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm0
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3]
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm8
+; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm9 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm6, %xmm10
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0],xmm8[1],xmm10[2,3]
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm11
+; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm12 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm6, %xmm13
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0],xmm11[1],xmm13[2,3]
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm3, %xmm3
+; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm14 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm6, %xmm6
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2,3]
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,3,5,7,5,7,6,7]
+; AVX2-FCP-NEXT: vpermd %ymm2, %ymm6, %ymm2
; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm4
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm5 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
-; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm6
-; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm5
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm5 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm6
+; AVX2-FCP-NEXT: vpermd %ymm1, %ymm6, %ymm1
+; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm6
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3]
; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm5
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm6 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm7
-; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm6
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm6
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm6 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm7
-; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm6
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm7 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm8
-; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm7
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
+; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm6
+; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm7
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm7 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm8
-; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm7
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm9
-; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm8
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm9
-; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm8
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm10
-; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm9
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm10
-; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm9
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm11
-; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm0, %xmm10
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm11
-; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm10
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm1, %xmm12
-; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm11
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm3
-; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm2
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX2-FCP-NEXT: vmovq %xmm4, (%rsi)
-; AVX2-FCP-NEXT: vmovq %xmm5, (%rdx)
-; AVX2-FCP-NEXT: vmovq %xmm6, (%rcx)
-; AVX2-FCP-NEXT: vmovq %xmm7, (%r8)
-; AVX2-FCP-NEXT: vmovq %xmm8, (%r9)
-; AVX2-FCP-NEXT: vmovq %xmm9, (%r11)
-; AVX2-FCP-NEXT: vmovq %xmm10, (%r10)
-; AVX2-FCP-NEXT: vmovq %xmm0, (%rax)
+; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm2
+; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm1, %xmm1
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
+; AVX2-FCP-NEXT: vmovq %xmm0, (%rsi)
+; AVX2-FCP-NEXT: vmovq %xmm8, (%rdx)
+; AVX2-FCP-NEXT: vmovq %xmm11, (%rcx)
+; AVX2-FCP-NEXT: vmovq %xmm3, (%r8)
+; AVX2-FCP-NEXT: vmovq %xmm4, (%r9)
+; AVX2-FCP-NEXT: vmovq %xmm5, (%r11)
+; AVX2-FCP-NEXT: vmovq %xmm6, (%r10)
+; AVX2-FCP-NEXT: vmovq %xmm1, (%rax)
+; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
;
; AVX512-LABEL: load_i8_stride8_vf8:
@@ -2663,182 +2628,97 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX2-FCP-LABEL: load_i8_stride8_vf16:
; AVX2-FCP: # %bb.0:
-; AVX2-FCP-NEXT: vmovdqa 112(%rdi), %xmm8
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm0 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
-; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm2
-; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm3
-; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm4
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
-; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm6
-; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm5
-; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm2
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
-; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm1
-; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm6
-; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm7
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
-; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm7, %xmm10
-; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm6, %xmm9
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
-; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm11
-; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm10
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
+; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm12
+; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm7
+; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm4
+; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm5
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7]
+; AVX2-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm6
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm6, %xmm3
+; AVX2-FCP-NEXT: vpermd %ymm4, %ymm0, %ymm1
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm15 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm1, %xmm8
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1,2],xmm3[3]
+; AVX2-FCP-NEXT: vpermd %ymm7, %ymm0, %ymm3
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm3[u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpermd %ymm12, %ymm0, %ymm0
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm0[2,3]
-; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm8, %xmm10
-; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm9
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm11
-; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm10
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm8[2,3]
+; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm6[u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm3[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm7, %xmm11
-; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm6, %xmm10
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm12
-; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm1, %xmm11
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm13 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm11
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm9[2,3]
-; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm8, %xmm11
-; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm10
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm4, %xmm12
-; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm5, %xmm11
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm12
-; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm6, %xmm11
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm12 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm13
-; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm12
-; AVX2-FCP-NEXT: vmovdqa %xmm1, %xmm9
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm12
-; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm11
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm12 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm4, %xmm13
-; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm5, %xmm12
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm12 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm7, %xmm13
-; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm6, %xmm12
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm14
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],xmm9[2,3]
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm6, %xmm6
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm1
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1,2],xmm6[3]
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm1
+; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm3 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,3,5,7,5,7,6,7]
+; AVX2-FCP-NEXT: vpermd %ymm5, %ymm1, %ymm2
+; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm2, %xmm5
+; AVX2-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm4
+; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm4, %xmm15
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3]
+; AVX2-FCP-NEXT: vpermd %ymm7, %ymm1, %ymm7
+; AVX2-FCP-NEXT: vpermd %ymm12, %ymm1, %ymm1
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm1[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0],xmm12[1],xmm15[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm12[0,1],xmm5[2,3]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm4[u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1,2],xmm12[3]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm1[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm4[u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm1, %xmm13
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm12 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm8, %xmm13
-; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm12
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm4, %xmm14
-; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm5, %xmm13
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0,1,2],xmm12[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm14
-; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm6, %xmm13
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm14 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm2, %xmm15
-; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm1, %xmm14
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm8, %xmm14
-; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm3, %xmm13
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm14 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm4, %xmm15
-; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm5, %xmm14
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm14 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm7, %xmm15
-; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm6, %xmm14
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm15 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm2, %xmm0
-; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm1, %xmm15
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm0[0,1],xmm13[2,3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm14
-; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm14 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm4, %xmm15
-; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm5, %xmm14
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1,2],xmm0[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm14 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm7, %xmm15
-; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm6, %xmm14
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm15 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm2, %xmm0
-; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm9, %xmm15
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm8, %xmm8
-; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm1
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm4
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm3
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm4
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm3
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm9, %xmm4
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0],xmm15[1],xmm13[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],xmm14[2,3]
+; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm2
+; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm4, %xmm4
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3]
+; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm4
+; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX2-FCP-NEXT: vmovaps %xmm2, (%rsi)
-; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-FCP-NEXT: vmovaps %xmm2, (%rdx)
+; AVX2-FCP-NEXT: vmovdqa %xmm8, (%rdx)
; AVX2-FCP-NEXT: vmovdqa %xmm10, (%rcx)
-; AVX2-FCP-NEXT: vmovdqa %xmm11, (%r8)
-; AVX2-FCP-NEXT: vmovdqa %xmm12, (%r9)
+; AVX2-FCP-NEXT: vmovdqa %xmm0, (%r8)
+; AVX2-FCP-NEXT: vmovdqa %xmm5, (%r9)
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-FCP-NEXT: vmovdqa %xmm13, (%rax)
+; AVX2-FCP-NEXT: vmovdqa %xmm12, (%rax)
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-FCP-NEXT: vmovdqa %xmm0, (%rax)
+; AVX2-FCP-NEXT: vmovdqa %xmm13, (%rax)
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT: vmovdqa %xmm1, (%rax)
+; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
;
; AVX512-LABEL: load_i8_stride8_vf16:
@@ -2962,114 +2842,77 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
-; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm0
-; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm3
-; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm1
-; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm2
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
-; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm2
-; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm6
-; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm3
-; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm5
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm9
+; AVX512-FCP-NEXT: vpmovqd %ymm9, %xmm8
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm0
+; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm10
+; AVX512-FCP-NEXT: vpmovqd %ymm10, %xmm11
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm11, %xmm1
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm12
+; AVX512-FCP-NEXT: vpmovqb %zmm12, %xmm1
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm8, %xmm1
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm11, %xmm2
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
+; AVX512-FCP-NEXT: vpsrlq $8, %zmm12, %zmm2
+; AVX512-FCP-NEXT: vpmovqb %zmm2, %xmm2
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm8, %xmm2
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm11, %xmm14
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3]
+; AVX512-FCP-NEXT: vpsrlq $16, %zmm12, %zmm14
+; AVX512-FCP-NEXT: vpmovqb %zmm14, %xmm14
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3]
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm8, %xmm8
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm15 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm11, %xmm11
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1,2],xmm8[3]
+; AVX512-FCP-NEXT: vpsrlq $24, %zmm12, %zmm11
+; AVX512-FCP-NEXT: vpmovqb %zmm11, %xmm11
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1],xmm8[2,3]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [1,3,5,7,5,7,6,7]
+; AVX512-FCP-NEXT: vpermd %ymm9, %ymm11, %ymm9
+; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm3
+; AVX512-FCP-NEXT: vpermd %ymm10, %ymm11, %ymm10
+; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm10, %xmm4
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3]
+; AVX512-FCP-NEXT: vpsrlq $32, %zmm12, %zmm4
+; AVX512-FCP-NEXT: vpmovqb %zmm4, %xmm4
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
+; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm9, %xmm4
+; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm10, %xmm5
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
-; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm5
-; AVX512-FCP-NEXT: vpmovqb %zmm5, %xmm6
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
-; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm7
-; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm6
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
-; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm8
-; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm7
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
+; AVX512-FCP-NEXT: vpsrlq $40, %zmm12, %zmm5
+; AVX512-FCP-NEXT: vpmovqb %zmm5, %xmm5
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
+; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm5
+; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm10, %xmm6
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
+; AVX512-FCP-NEXT: vpsrlq $48, %zmm12, %zmm6
+; AVX512-FCP-NEXT: vpmovqb %zmm6, %xmm6
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
+; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm9, %xmm6
+; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm10, %xmm7
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3]
-; AVX512-FCP-NEXT: vpsrlq $8, %zmm5, %zmm7
+; AVX512-FCP-NEXT: vpsrlq $56, %zmm12, %zmm7
; AVX512-FCP-NEXT: vpmovqb %zmm7, %xmm7
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
-; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm8
-; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm7
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
-; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm9
-; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm8
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
-; AVX512-FCP-NEXT: vpsrlq $16, %zmm5, %zmm8
-; AVX512-FCP-NEXT: vpmovqb %zmm8, %xmm8
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
-; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm9
-; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm8
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
-; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm10
-; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm9
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
-; AVX512-FCP-NEXT: vpsrlq $24, %zmm5, %zmm9
-; AVX512-FCP-NEXT: vpmovqb %zmm9, %xmm9
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
-; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm10
-; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm9
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
-; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm11
-; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm10
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3]
-; AVX512-FCP-NEXT: vpsrlq $32, %zmm5, %zmm10
-; AVX512-FCP-NEXT: vpmovqb %zmm10, %xmm10
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
-; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm0, %xmm11
-; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm10
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
-; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm12
-; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm11
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
-; AVX512-FCP-NEXT: vpsrlq $40, %zmm5, %zmm11
-; AVX512-FCP-NEXT: vpmovqb %zmm11, %xmm11
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
-; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm12
-; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm1, %xmm11
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm12 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
-; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm13
-; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm12
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3]
-; AVX512-FCP-NEXT: vpsrlq $48, %zmm5, %zmm12
-; AVX512-FCP-NEXT: vpmovqb %zmm12, %xmm12
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm12 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
-; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm0, %xmm0
-; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm1
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
-; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm2
-; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm1
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX512-FCP-NEXT: vpsrlq $56, %zmm5, %zmm1
-; AVX512-FCP-NEXT: vpmovqb %zmm1, %xmm1
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512-FCP-NEXT: vmovdqa %xmm4, (%rsi)
-; AVX512-FCP-NEXT: vmovdqa %xmm6, (%rdx)
-; AVX512-FCP-NEXT: vmovdqa %xmm7, (%rcx)
+; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512-FCP-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX512-FCP-NEXT: vmovdqa %xmm2, (%rcx)
; AVX512-FCP-NEXT: vmovdqa %xmm8, (%r8)
-; AVX512-FCP-NEXT: vmovdqa %xmm9, (%r9)
-; AVX512-FCP-NEXT: vmovdqa %xmm10, (%r11)
-; AVX512-FCP-NEXT: vmovdqa %xmm11, (%r10)
-; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rax)
+; AVX512-FCP-NEXT: vmovdqa %xmm3, (%r9)
+; AVX512-FCP-NEXT: vmovdqa %xmm4, (%r11)
+; AVX512-FCP-NEXT: vmovdqa %xmm5, (%r10)
+; AVX512-FCP-NEXT: vmovdqa %xmm6, (%rax)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
@@ -3194,114 +3037,77 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
-; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm0
-; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm3
-; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm1
-; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm2
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
-; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm6
-; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm3
-; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm5
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm9
+; AVX512DQ-FCP-NEXT: vpmovqd %ymm9, %xmm8
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm0
+; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm10
+; AVX512DQ-FCP-NEXT: vpmovqd %ymm10, %xmm11
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm11, %xmm1
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm12
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm12, %xmm1
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm8, %xmm1
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm11, %xmm2
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
+; AVX512DQ-FCP-NEXT: vpsrlq $8, %zmm12, %zmm2
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm2, %xmm2
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm8, %xmm2
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm11, %xmm14
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3]
+; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm12, %zmm14
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm14, %xmm14
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3]
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm8, %xmm8
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm15 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm11, %xmm11
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1,2],xmm8[3]
+; AVX512DQ-FCP-NEXT: vpsrlq $24, %zmm12, %zmm11
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm11, %xmm11
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1],xmm8[2,3]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [1,3,5,7,5,7,6,7]
+; AVX512DQ-FCP-NEXT: vpermd %ymm9, %ymm11, %ymm9
+; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm3
+; AVX512DQ-FCP-NEXT: vpermd %ymm10, %ymm11, %ymm10
+; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm10, %xmm4
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3]
+; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm12, %zmm4
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm4, %xmm4
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm9, %xmm4
+; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm10, %xmm5
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
-; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm5
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm5, %xmm6
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm7
-; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm6
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm8
-; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm7
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
+; AVX512DQ-FCP-NEXT: vpsrlq $40, %zmm12, %zmm5
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm5, %xmm5
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm5
+; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm10, %xmm6
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
+; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm12, %zmm6
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm6, %xmm6
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm9, %xmm6
+; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm10, %xmm7
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $8, %zmm5, %zmm7
+; AVX512DQ-FCP-NEXT: vpsrlq $56, %zmm12, %zmm7
; AVX512DQ-FCP-NEXT: vpmovqb %zmm7, %xmm7
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm8
-; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm7
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm9
-; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm8
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm5, %zmm8
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm8, %xmm8
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm9
-; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm8
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm10
-; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm9
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $24, %zmm5, %zmm9
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm9, %xmm9
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm10
-; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm9
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm11
-; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm10
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm5, %zmm10
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm10, %xmm10
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm0, %xmm11
-; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm10
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm12
-; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm11
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $40, %zmm5, %zmm11
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm11, %xmm11
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm12
-; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm1, %xmm11
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm12 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm13
-; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm12
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm5, %zmm12
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm12, %xmm12
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm12 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm1
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $56, %zmm5, %zmm1
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm1, %xmm1
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm7, (%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, (%rcx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm8, (%r8)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm9, (%r9)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm10, (%r11)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm11, (%r10)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, (%r9)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, (%r11)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, (%r10)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, (%rax)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
@@ -3426,114 +3232,77 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
-; AVX512BW-FCP-NEXT: vmovdqa 112(%rdi), %xmm0
-; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm3
-; AVX512BW-FCP-NEXT: vmovdqa 96(%rdi), %xmm1
-; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm2
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm5 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
-; AVX512BW-FCP-NEXT: vmovdqa 80(%rdi), %xmm2
-; AVX512BW-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm6
-; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm3
-; AVX512BW-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm5
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
-; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm5
-; AVX512BW-FCP-NEXT: vpmovqb %zmm5, %xmm6
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
-; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm6 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm7
-; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm6
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm7 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm8
-; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm7
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3]
-; AVX512BW-FCP-NEXT: vpsrlq $8, %zmm5, %zmm7
+; AVX512BW-FCP-NEXT: vmovdqa 96(%rdi), %ymm4
+; AVX512BW-FCP-NEXT: vpmovqd %ymm4, %xmm3
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512BW-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm0
+; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm6
+; AVX512BW-FCP-NEXT: vpmovqd %ymm6, %xmm7
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm1
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm9
+; AVX512BW-FCP-NEXT: vpmovqb %zmm9, %xmm1
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512BW-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm1
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm2
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
+; AVX512BW-FCP-NEXT: vpsrlq $8, %zmm9, %zmm2
+; AVX512BW-FCP-NEXT: vpmovqb %zmm2, %xmm2
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm12 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm2
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm14
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3]
+; AVX512BW-FCP-NEXT: vpsrlq $16, %zmm9, %zmm14
+; AVX512BW-FCP-NEXT: vpmovqb %zmm14, %xmm14
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3]
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512BW-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm3
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm15 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512BW-FCP-NEXT: vpshufb %xmm15, %xmm7, %xmm7
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3]
+; AVX512BW-FCP-NEXT: vpsrlq $24, %zmm9, %zmm7
; AVX512BW-FCP-NEXT: vpmovqb %zmm7, %xmm7
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
-; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm7 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm8
-; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm7
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm9
-; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm8
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3]
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,3,5,7,5,7,6,7]
+; AVX512BW-FCP-NEXT: vpermd %ymm4, %ymm7, %ymm4
+; AVX512BW-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm5
+; AVX512BW-FCP-NEXT: vpermd %ymm6, %ymm7, %ymm6
+; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm7
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3]
+; AVX512BW-FCP-NEXT: vpsrlq $32, %zmm9, %zmm7
+; AVX512BW-FCP-NEXT: vpmovqb %zmm7, %xmm7
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3]
+; AVX512BW-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm7
+; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm6, %xmm8
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
-; AVX512BW-FCP-NEXT: vpsrlq $16, %zmm5, %zmm8
+; AVX512BW-FCP-NEXT: vpsrlq $40, %zmm9, %zmm8
; AVX512BW-FCP-NEXT: vpmovqb %zmm8, %xmm8
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
-; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm9
-; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm8
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
-; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512BW-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm10
-; AVX512BW-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm9
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
-; AVX512BW-FCP-NEXT: vpsrlq $24, %zmm5, %zmm9
-; AVX512BW-FCP-NEXT: vpmovqb %zmm9, %xmm9
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
-; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX512BW-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm10
-; AVX512BW-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm9
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX512BW-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm11
-; AVX512BW-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm10
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3]
-; AVX512BW-FCP-NEXT: vpsrlq $32, %zmm5, %zmm10
+; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm4, %xmm8
+; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm6, %xmm10
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3]
+; AVX512BW-FCP-NEXT: vpsrlq $48, %zmm9, %zmm10
; AVX512BW-FCP-NEXT: vpmovqb %zmm10, %xmm10
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
-; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX512BW-FCP-NEXT: vpshufb %xmm10, %xmm0, %xmm11
-; AVX512BW-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm10
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm12
-; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm11
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
-; AVX512BW-FCP-NEXT: vpsrlq $40, %zmm5, %zmm11
-; AVX512BW-FCP-NEXT: vpmovqb %zmm11, %xmm11
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
-; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm12
-; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm1, %xmm11
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
-; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm12 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm13
-; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm12
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3]
-; AVX512BW-FCP-NEXT: vpsrlq $48, %zmm5, %zmm12
-; AVX512BW-FCP-NEXT: vpmovqb %zmm12, %xmm12
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3]
-; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm12 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm0, %xmm0
-; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm1
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX512BW-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm2
-; AVX512BW-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm1
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX512BW-FCP-NEXT: vpsrlq $56, %zmm5, %zmm1
-; AVX512BW-FCP-NEXT: vpmovqb %zmm1, %xmm1
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512BW-FCP-NEXT: vmovdqa %xmm4, (%rsi)
-; AVX512BW-FCP-NEXT: vmovdqa %xmm6, (%rdx)
-; AVX512BW-FCP-NEXT: vmovdqa %xmm7, (%rcx)
-; AVX512BW-FCP-NEXT: vmovdqa %xmm8, (%r8)
-; AVX512BW-FCP-NEXT: vmovdqa %xmm9, (%r9)
-; AVX512BW-FCP-NEXT: vmovdqa %xmm10, (%r11)
-; AVX512BW-FCP-NEXT: vmovdqa %xmm11, (%r10)
-; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%rax)
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3]
+; AVX512BW-FCP-NEXT: vpshufb %xmm14, %xmm4, %xmm4
+; AVX512BW-FCP-NEXT: vpshufb %xmm15, %xmm6, %xmm6
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3]
+; AVX512BW-FCP-NEXT: vpsrlq $56, %zmm9, %zmm6
+; AVX512BW-FCP-NEXT: vpmovqb %zmm6, %xmm6
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
+; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512BW-FCP-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%rcx)
+; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%r8)
+; AVX512BW-FCP-NEXT: vmovdqa %xmm5, (%r9)
+; AVX512BW-FCP-NEXT: vmovdqa %xmm7, (%r11)
+; AVX512BW-FCP-NEXT: vmovdqa %xmm8, (%r10)
+; AVX512BW-FCP-NEXT: vmovdqa %xmm4, (%rax)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
@@ -3658,114 +3427,77 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 112(%rdi), %xmm0
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm3
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 96(%rdi), %xmm1
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm2
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm5 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 80(%rdi), %xmm2
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm6
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm3
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm5
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm5
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm5, %xmm6
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm6 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm7
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm6
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm7 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm8
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm7
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3]
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $8, %zmm5, %zmm7
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 96(%rdi), %ymm4
+; AVX512DQ-BW-FCP-NEXT: vpmovqd %ymm4, %xmm3
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm6
+; AVX512DQ-BW-FCP-NEXT: vpmovqd %ymm6, %xmm7
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm1
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm9
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm9, %xmm1
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm1
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm2
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $8, %zmm9, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm2, %xmm2
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm12 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm2
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm14
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3]
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %zmm9, %zmm14
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm14, %xmm14
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3]
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm3
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm15 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm15, %xmm7, %xmm7
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3]
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $24, %zmm9, %zmm7
; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm7, %xmm7
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm7 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm8
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm7
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm9
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm8
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,3,5,7,5,7,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm4, %ymm7, %ymm4
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm5
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm6, %ymm7, %ymm6
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm7
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3]
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $32, %zmm9, %zmm7
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm7, %xmm7
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm7
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm6, %xmm8
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %zmm5, %zmm8
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $40, %zmm9, %zmm8
; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm8, %xmm8
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm9
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm8
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm10
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm9
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $24, %zmm5, %zmm9
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm9, %xmm9
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm10
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm9
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm11
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm10
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3]
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $32, %zmm5, %zmm10
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm4, %xmm8
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm6, %xmm10
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3]
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %zmm9, %zmm10
; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm10, %xmm10
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm10, %xmm0, %xmm11
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm10
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm12
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm11
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $40, %zmm5, %zmm11
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm11, %xmm11
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm12
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm1, %xmm11
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm12 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm13
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm12
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3]
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %zmm5, %zmm12
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm12, %xmm12
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm12 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm0, %xmm0
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm1
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm2
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm1
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $56, %zmm5, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm1, %xmm1
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm4, (%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm6, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm7, (%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm8, (%r8)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm9, (%r9)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm10, (%r11)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm11, (%r10)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%rax)
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm14, %xmm4, %xmm4
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm15, %xmm6, %xmm6
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3]
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $56, %zmm9, %zmm6
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm6, %xmm6
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%r8)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm5, (%r9)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm7, (%r11)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm8, (%r10)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm4, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%wide.vec = load <128 x i8>, ptr %in.vec, align 64
@@ -6063,305 +5795,180 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX2-FCP-LABEL: load_i8_stride8_vf32:
; AVX2-FCP: # %bb.0:
-; AVX2-FCP-NEXT: subq $248, %rsp
-; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm4
-; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm7
-; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 112(%rdi), %xmm8
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm0 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
-; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm1
-; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm13
-; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm13, %xmm0
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm10
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
-; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm2
-; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm3
-; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm1
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm5
-; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm3
-; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm6
-; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm15
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
-; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm15, %xmm9
-; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm6, %xmm1
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
-; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm11
-; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm5, %xmm9
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0],xmm1[1],xmm9[2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3]
-; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6]
-; AVX2-FCP-NEXT: vpermd %ymm7, %ymm1, %ymm9
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28]
-; AVX2-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm0
-; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3,4,5,6],ymm2[7]
-; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm0
-; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm4
+; AVX2-FCP-NEXT: subq $136, %rsp
+; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm3
+; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm4
; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm11
-; AVX2-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm14
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5],ymm12[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm0 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm2
-; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm13, %xmm0
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX2-FCP-NEXT: vmovdqa %xmm10, %xmm4
-; AVX2-FCP-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm10, %xmm3
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm2
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm15, %xmm3
-; AVX2-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm2
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm12
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm3
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3,4],ymm3[5],ymm12[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm2
-; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm13, %xmm0
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm3
-; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm2
-; AVX2-FCP-NEXT: vmovdqa %xmm1, %xmm4
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm15, %xmm3
-; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm2
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm12
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm3
-; AVX2-FCP-NEXT: vmovdqa %xmm5, %xmm1
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
-; AVX2-FCP-NEXT: vmovdqa %ymm9, %ymm7
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30]
-; AVX2-FCP-NEXT: vmovdqa %ymm10, %ymm9
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3,4],ymm3[5],ymm12[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm5
+; AVX2-FCP-NEXT: vmovdqu %ymm5, (%rsp) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm2
+; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm1
+; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6]
+; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm10
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28]
+; AVX2-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm14
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX2-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm8
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpermd %ymm4, %ymm0, %ymm6
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7]
+; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm11
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7]
+; AVX2-FCP-NEXT: vpermd %ymm11, %ymm0, %ymm7
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12]
+; AVX2-FCP-NEXT: vpermd %ymm3, %ymm0, %ymm4
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm12[0,1,2],xmm1[3]
+; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm12
+; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm15
+; AVX2-FCP-NEXT: vpermd %ymm15, %ymm0, %ymm5
+; AVX2-FCP-NEXT: vpermd %ymm12, %ymm0, %ymm3
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm5[u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm3[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm13[0],xmm0[1],xmm13[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX2-FCP-NEXT: vmovdqa %xmm8, %xmm5
-; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm2
-; AVX2-FCP-NEXT: vmovdqa %xmm13, %xmm8
-; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm13, %xmm0
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm3
-; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm2
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm15, %xmm3
-; AVX2-FCP-NEXT: vmovdqa %xmm15, %xmm10
-; AVX2-FCP-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm11, %xmm2
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm13, %xmm12
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm3
-; AVX2-FCP-NEXT: vmovdqa %xmm1, %xmm4
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4],ymm1[5],ymm9[6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1,2],xmm1[3]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm3[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm13[0],xmm9[1],xmm13[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm1
-; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm0
-; AVX2-FCP-NEXT: vmovdqa %xmm8, %xmm9
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm6, %xmm2
-; AVX2-FCP-NEXT: vmovdqa %xmm6, %xmm8
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm15, %xmm1
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm2
-; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm11, %xmm1
-; AVX2-FCP-NEXT: vmovdqa %xmm11, %xmm14
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm13, %xmm3
-; AVX2-FCP-NEXT: vmovdqa %xmm13, %xmm6
-; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm2
-; AVX2-FCP-NEXT: vmovdqa %xmm4, %xmm10
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
-; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,3,2,3,1,3,5,7]
-; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28]
-; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
-; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm11 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm13 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4],ymm1[5],ymm9[6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1,2],xmm1[3]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm3[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm13[0],xmm9[1],xmm13[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm1 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX2-FCP-NEXT: vmovdqa %xmm5, %xmm0
-; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm2
-; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm9, %xmm1
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm8, %xmm3
-; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm15, %xmm2
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm3
-; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm14, %xmm2
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm4
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm10, %xmm3
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4],ymm1[5],ymm6[6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm4 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm2
+; AVX2-FCP-NEXT: vmovdqa %xmm4, %xmm3
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm6[1],xmm2[2,3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm1 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm3
-; AVX2-FCP-NEXT: vmovdqa %xmm0, %xmm2
-; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm9, %xmm1
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm4
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm15, %xmm3
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm4
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm14, %xmm3
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm0
-; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm10, %xmm4
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm0 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm3
-; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm9, %xmm0
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm4
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7]
+; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28]
+; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-FCP-NEXT: vpermd (%rsp), %ymm0, %ymm8 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4],ymm0[5],ymm9[6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,3,5,7,5,7,6,7]
+; AVX2-FCP-NEXT: vpermd %ymm11, %ymm2, %ymm9
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm9[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12]
+; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm11 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1,2],xmm13[3]
+; AVX2-FCP-NEXT: vpermd %ymm15, %ymm2, %ymm13
+; AVX2-FCP-NEXT: vpermd %ymm12, %ymm2, %ymm15
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm15[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm12[0],xmm2[1],xmm12[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm14[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4],ymm2[5],ymm14[6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm15[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm14[1],xmm5[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm5[5],ymm14[6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm9[u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm14[0,1,2],xmm5[3]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm15[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm14[1],xmm4[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm9[u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm13[u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm15, %xmm3
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm4
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm14, %xmm3
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm6
-; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm10, %xmm4
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovaps %ymm2, (%rsi)
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovaps %ymm2, (%rdx)
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovaps %ymm2, (%rcx)
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovaps %ymm2, (%r8)
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovaps %ymm2, (%r9)
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FCP-NEXT: vmovaps %ymm4, (%rsi)
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FCP-NEXT: vmovaps %ymm4, (%rdx)
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FCP-NEXT: vmovaps %ymm4, (%rcx)
+; AVX2-FCP-NEXT: vmovdqa %ymm1, (%r8)
+; AVX2-FCP-NEXT: vmovdqa %ymm12, (%r9)
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovaps %ymm2, (%rax)
+; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rax)
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-FCP-NEXT: vmovdqa %ymm1, (%rax)
+; AVX2-FCP-NEXT: vmovdqa %ymm2, (%rax)
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rax)
-; AVX2-FCP-NEXT: addq $248, %rsp
+; AVX2-FCP-NEXT: vmovdqa %ymm3, (%rax)
+; AVX2-FCP-NEXT: addq $136, %rsp
; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
;
@@ -6721,231 +6328,186 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX512-FCP-LABEL: load_i8_stride8_vf32:
; AVX512-FCP: # %bb.0:
-; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm18
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm20
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6]
; AVX512-FCP-NEXT: vmovdqa64 224(%rdi), %ymm16
-; AVX512-FCP-NEXT: vpermd %ymm16, %ymm0, %ymm4
-; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm4, %ymm1
+; AVX512-FCP-NEXT: vpermd %ymm16, %ymm0, %ymm1
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm3
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm17
-; AVX512-FCP-NEXT: vpermd %ymm17, %ymm0, %ymm6
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
-; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm13
-; AVX512-FCP-NEXT: vpermd %ymm13, %ymm0, %ymm9
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm15
-; AVX512-FCP-NEXT: vpermd %ymm15, %ymm0, %ymm11
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
-; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm2
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm5
-; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm1
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm3
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
-; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm12
-; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm12, %xmm10
-; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm5
-; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm8
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
-; AVX512-FCP-NEXT: vpmovqb %zmm18, %xmm8
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm22
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5,6],ymm0[7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm7
-; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm27
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm11, %ymm8
-; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm28
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
-; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm8
-; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm7
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
-; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm12, %xmm10
-; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm8
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
-; AVX512-FCP-NEXT: vpsrlq $8, %zmm18, %zmm8
-; AVX512-FCP-NEXT: vpmovqb %zmm8, %xmm8
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm20
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm0
-; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm29
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm6, %ymm7
-; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm30
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5,6],ymm0[7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm7
-; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm31
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm11, %ymm8
-; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm26
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
-; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm8
-; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm7
-; AVX512-FCP-NEXT: vmovdqa %xmm1, %xmm3
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
-; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm12, %xmm10
-; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm8
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
-; AVX512-FCP-NEXT: vpsrlq $16, %zmm18, %zmm8
-; AVX512-FCP-NEXT: vpmovqb %zmm8, %xmm8
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT: vpermd %ymm17, %ymm0, %ymm2
+; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm4
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2,3,4,5,6],ymm3[7]
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512-FCP-NEXT: vmovdqa64 160(%rdi), %ymm18
+; AVX512-FCP-NEXT: vpermd %ymm18, %ymm0, %ymm3
+; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm6
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm11
+; AVX512-FCP-NEXT: vpermd %ymm11, %ymm0, %ymm4
+; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm4, %ymm0
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
+; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm12
+; AVX512-FCP-NEXT: vpmovqd %ymm12, %xmm7
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm7[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12]
+; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm19
+; AVX512-FCP-NEXT: vpmovqd %ymm19, %xmm9
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm9[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
+; AVX512-FCP-NEXT: vpmovqb %zmm20, %xmm6
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm21
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm0
-; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm25
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm6, %ymm4
-; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm24
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6],ymm0[7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm9, %ymm4
-; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm23
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm11, %ymm6
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5],ymm6[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
-; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm6
-; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
-; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm12, %xmm9
-; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm6
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3]
-; AVX512-FCP-NEXT: vpsrlq $24, %zmm18, %zmm6
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm7[u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm9[u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
+; AVX512-FCP-NEXT: vpsrlq $8, %zmm20, %zmm6
; AVX512-FCP-NEXT: vpmovqb %zmm6, %xmm6
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm19
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7]
-; AVX512-FCP-NEXT: vpermd %ymm16, %ymm0, %ymm11
-; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm11, %ymm4
-; AVX512-FCP-NEXT: vpermd %ymm17, %ymm0, %ymm9
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5,6],ymm4[7]
-; AVX512-FCP-NEXT: vpermd %ymm13, %ymm0, %ymm13
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpermd %ymm15, %ymm0, %ymm1
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5],ymm4[6,7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
-; AVX512-FCP-NEXT: vmovdqa %xmm2, %xmm6
-; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm2, %xmm15
-; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm14
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm15 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
-; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm12, %xmm0
-; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm5, %xmm15
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm14[3]
-; AVX512-FCP-NEXT: vpsrlq $32, %zmm18, %zmm14
-; AVX512-FCP-NEXT: vpmovqb %zmm14, %xmm14
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm4[4,5,6,7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6],ymm0[7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm2
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm13, %ymm4
-; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm2
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm14
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4],ymm4[5],ymm14[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
-; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm14
-; AVX512-FCP-NEXT: vmovdqa %xmm3, %xmm7
-; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
-; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm0
-; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm5, %xmm14
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3]
-; AVX512-FCP-NEXT: vpsrlq $40, %zmm18, %zmm4
-; AVX512-FCP-NEXT: vpmovqb %zmm4, %xmm4
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm11, %ymm0
-; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm2
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm9, %ymm2
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm31, %ymm2
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm13, %ymm2
-; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm4
-; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm14
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4],ymm2[5],ymm14[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
-; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm14
-; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
-; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm0
-; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm5, %xmm14
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3]
-; AVX512-FCP-NEXT: vpsrlq $48, %zmm18, %zmm2
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm22
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm0
+; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm31
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm5
+; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm30
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7]
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm5
+; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm29
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm6
+; AVX512-FCP-NEXT: vmovdqa64 %ymm8, %ymm28
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7]
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm5
+; AVX512-FCP-NEXT: vmovdqa64 %xmm6, %xmm27
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm9, %xmm6
+; AVX512-FCP-NEXT: vmovdqa64 %xmm8, %xmm26
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
+; AVX512-FCP-NEXT: vpsrlq $16, %zmm20, %zmm6
+; AVX512-FCP-NEXT: vpmovqb %zmm6, %xmm6
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm0
+; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm25
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm1
+; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm24
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm1
+; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm23
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm2
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm1
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm2
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
+; AVX512-FCP-NEXT: vpsrlq $24, %zmm20, %zmm2
; AVX512-FCP-NEXT: vpmovqb %zmm2, %xmm2
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm2
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm11, %ymm2
-; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm3
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm3
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm3
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm13, %ymm3
-; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm1
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
-; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm3
-; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm7, %xmm2
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm12, %xmm8
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm3
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3]
-; AVX512-FCP-NEXT: vpsrlq $56, %zmm18, %zmm3
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7]
+; AVX512-FCP-NEXT: vpermd %ymm16, %ymm0, %ymm4
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm1
+; AVX512-FCP-NEXT: vpermd %ymm17, %ymm0, %ymm7
+; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm7, %ymm9
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,6],ymm1[7]
+; AVX512-FCP-NEXT: vpermd %ymm18, %ymm0, %ymm9
+; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm9, %ymm14
+; AVX512-FCP-NEXT: vpermd %ymm11, %ymm0, %ymm11
+; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm11, %ymm0
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,5,7,5,7,6,7]
+; AVX512-FCP-NEXT: vpermd %ymm12, %ymm0, %ymm14
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm14[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12]
+; AVX512-FCP-NEXT: vpermd %ymm19, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1,2],xmm12[3]
+; AVX512-FCP-NEXT: vpsrlq $32, %zmm20, %zmm15
+; AVX512-FCP-NEXT: vpmovqb %zmm15, %xmm15
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1],xmm12[2,3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5,6],ymm1[7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm1[6,7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3]
+; AVX512-FCP-NEXT: vpsrlq $40, %zmm20, %zmm15
+; AVX512-FCP-NEXT: vpmovqb %zmm15, %xmm15
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm31, %ymm10
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm13
+; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm10
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm7, %ymm15
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm10
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm15
+; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm10
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm11, %ymm10
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm15[5],ymm10[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm13[6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm13
+; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm14, %xmm13
+; AVX512-FCP-NEXT: vmovdqa64 %xmm26, %xmm15
+; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm0, %xmm15
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3]
+; AVX512-FCP-NEXT: vpsrlq $48, %zmm20, %zmm15
+; AVX512-FCP-NEXT: vpmovqb %zmm15, %xmm15
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3],ymm10[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm13
+; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm4, %ymm4
+; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm13
+; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm7, %ymm7
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5,6],ymm4[7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm7
+; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm7
+; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm11, %ymm8
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm14, %xmm5
+; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[3]
+; AVX512-FCP-NEXT: vpsrlq $56, %zmm20, %zmm3
; AVX512-FCP-NEXT: vpmovqb %zmm3, %xmm3
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm22, (%rsi)
-; AVX512-FCP-NEXT: vmovdqa64 %ymm20, (%rdx)
-; AVX512-FCP-NEXT: vmovdqa64 %ymm21, (%rcx)
-; AVX512-FCP-NEXT: vmovdqa64 %ymm19, (%r8)
-; AVX512-FCP-NEXT: vmovdqa %ymm15, (%r9)
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm21, (%rsi)
+; AVX512-FCP-NEXT: vmovdqa64 %ymm22, (%rdx)
+; AVX512-FCP-NEXT: vmovdqa %ymm6, (%rcx)
+; AVX512-FCP-NEXT: vmovdqa %ymm2, (%r8)
+; AVX512-FCP-NEXT: vmovdqa %ymm12, (%r9)
+; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-FCP-NEXT: vmovdqa %ymm1, (%rax)
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT: vmovdqa %ymm10, (%rax)
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rax)
-; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT: vmovdqa %ymm1, (%rax)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
@@ -7305,231 +6867,186 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX512DQ-FCP-LABEL: load_i8_stride8_vf32:
; AVX512DQ-FCP: # %bb.0:
-; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm18
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm20
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6]
; AVX512DQ-FCP-NEXT: vmovdqa64 224(%rdi), %ymm16
-; AVX512DQ-FCP-NEXT: vpermd %ymm16, %ymm0, %ymm4
-; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm4, %ymm1
+; AVX512DQ-FCP-NEXT: vpermd %ymm16, %ymm0, %ymm1
+; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm3
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm17
-; AVX512DQ-FCP-NEXT: vpermd %ymm17, %ymm0, %ymm6
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm13
-; AVX512DQ-FCP-NEXT: vpermd %ymm13, %ymm0, %ymm9
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm15
-; AVX512DQ-FCP-NEXT: vpermd %ymm15, %ymm0, %ymm11
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
-; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm5
-; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm1
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm3
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
-; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm12
-; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm12, %xmm10
-; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm5
-; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm8
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm18, %xmm8
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm22
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm7
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm27
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm11, %ymm8
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm28
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm8
-; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm7
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm12, %xmm10
-; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm8
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $8, %zmm18, %zmm8
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm8, %xmm8
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm20
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm29
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm6, %ymm7
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm30
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm7
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm31
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm11, %ymm8
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm26
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm8
-; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm7
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, %xmm3
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm12, %xmm10
-; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm8
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm18, %zmm8
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm8, %xmm8
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpermd %ymm17, %ymm0, %ymm2
+; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm4
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2,3,4,5,6],ymm3[7]
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-FCP-NEXT: vmovdqa64 160(%rdi), %ymm18
+; AVX512DQ-FCP-NEXT: vpermd %ymm18, %ymm0, %ymm3
+; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm6
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm11
+; AVX512DQ-FCP-NEXT: vpermd %ymm11, %ymm0, %ymm4
+; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm4, %ymm0
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm12
+; AVX512DQ-FCP-NEXT: vpmovqd %ymm12, %xmm7
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm7[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12]
+; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm19
+; AVX512DQ-FCP-NEXT: vpmovqd %ymm19, %xmm9
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm9[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm20, %xmm6
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm21
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm25
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm6, %ymm4
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm24
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm9, %ymm4
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm23
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm11, %ymm6
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5],ymm6[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm6
-; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm12, %xmm9
-; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm6
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $24, %zmm18, %zmm6
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm7[u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm9[u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
+; AVX512DQ-FCP-NEXT: vpsrlq $8, %zmm20, %zmm6
; AVX512DQ-FCP-NEXT: vpmovqb %zmm6, %xmm6
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm19
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7]
-; AVX512DQ-FCP-NEXT: vpermd %ymm16, %ymm0, %ymm11
-; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm11, %ymm4
-; AVX512DQ-FCP-NEXT: vpermd %ymm17, %ymm0, %ymm9
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5,6],ymm4[7]
-; AVX512DQ-FCP-NEXT: vpermd %ymm13, %ymm0, %ymm13
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpermd %ymm15, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5],ymm4[6,7]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, %xmm6
-; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm2, %xmm15
-; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm14
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm15 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm12, %xmm0
-; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm5, %xmm15
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm14[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm18, %zmm14
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm14, %xmm14
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm2
-; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm13, %ymm4
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm2
-; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm14
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4],ymm4[5],ymm14[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm14
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, %xmm7
-; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm0
-; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm5, %xmm14
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $40, %zmm18, %zmm4
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm4, %xmm4
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm11, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm2
-; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm9, %ymm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm2
-; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm13, %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm4
-; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm14
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4],ymm2[5],ymm14[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm14
-; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm0
-; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm5, %xmm14
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm18, %zmm2
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm22
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm31
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm5
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm30
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7]
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm5
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm29
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm6
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm8, %ymm28
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm5
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm6, %xmm27
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm9, %xmm6
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm8, %xmm26
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
+; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm20, %zmm6
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm6, %xmm6
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm25
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm24
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm23
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm2
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm1
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm2
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
+; AVX512DQ-FCP-NEXT: vpsrlq $24, %zmm20, %zmm2
; AVX512DQ-FCP-NEXT: vpmovqb %zmm2, %xmm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm2
-; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm11, %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm3
-; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm3
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm3
-; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm13, %ymm3
-; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm3
-; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm7, %xmm2
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm12, %xmm8
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm3
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $56, %zmm18, %zmm3
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7]
+; AVX512DQ-FCP-NEXT: vpermd %ymm16, %ymm0, %ymm4
+; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm1
+; AVX512DQ-FCP-NEXT: vpermd %ymm17, %ymm0, %ymm7
+; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm7, %ymm9
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-FCP-NEXT: vpermd %ymm18, %ymm0, %ymm9
+; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm9, %ymm14
+; AVX512DQ-FCP-NEXT: vpermd %ymm11, %ymm0, %ymm11
+; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm11, %ymm0
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,5,7,5,7,6,7]
+; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm0, %ymm14
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm14[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12]
+; AVX512DQ-FCP-NEXT: vpermd %ymm19, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1,2],xmm12[3]
+; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm20, %zmm15
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm15, %xmm15
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1],xmm12[2,3]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3]
+; AVX512DQ-FCP-NEXT: vpsrlq $40, %zmm20, %zmm15
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm15, %xmm15
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm10
+; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm13
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm10
+; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm7, %ymm15
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm10
+; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm15
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm10
+; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm11, %ymm10
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm15[5],ymm10[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm13[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm13
+; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm14, %xmm13
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm15
+; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm0, %xmm15
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3]
+; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm20, %zmm15
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm15, %xmm15
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm13
+; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm4, %ymm4
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm13
+; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm7, %ymm7
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5,6],ymm4[7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm7
+; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm7
+; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm11, %ymm8
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm14, %xmm5
+; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[3]
+; AVX512DQ-FCP-NEXT: vpsrlq $56, %zmm20, %zmm3
; AVX512DQ-FCP-NEXT: vpmovqb %zmm3, %xmm3
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, (%rcx)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, (%r8)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm15, (%r9)
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, (%rsi)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, (%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, (%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%r8)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, (%r9)
+; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, (%rax)
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, (%rax)
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rax)
-; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, (%rax)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
@@ -7837,214 +7354,169 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX512BW-FCP-LABEL: load_i8_stride8_vf32:
; AVX512BW-FCP: # %bb.0:
-; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm12 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6]
-; AVX512BW-FCP-NEXT: vmovdqa 224(%rdi), %ymm9
-; AVX512BW-FCP-NEXT: vpermd %ymm9, %ymm1, %ymm4
-; AVX512BW-FCP-NEXT: vpshufb %ymm12, %ymm4, %ymm2
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm30 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm11
-; AVX512BW-FCP-NEXT: vpermd %ymm11, %ymm1, %ymm26
-; AVX512BW-FCP-NEXT: vpshufb %ymm30, %ymm26, %ymm3
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm31 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm13
-; AVX512BW-FCP-NEXT: vpermd %ymm13, %ymm1, %ymm27
-; AVX512BW-FCP-NEXT: vpshufb %ymm31, %ymm27, %ymm3
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm4
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6]
+; AVX512BW-FCP-NEXT: vmovdqa 224(%rdi), %ymm5
+; AVX512BW-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm7
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28]
+; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm8
+; AVX512BW-FCP-NEXT: vpermd %ymm8, %ymm0, %ymm30
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
+; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm11
+; AVX512BW-FCP-NEXT: vpermd %ymm11, %ymm0, %ymm3
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm15
-; AVX512BW-FCP-NEXT: vpermd %ymm15, %ymm1, %ymm28
-; AVX512BW-FCP-NEXT: vpshufb %ymm14, %ymm28, %ymm1
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
-; AVX512BW-FCP-NEXT: vmovdqa 112(%rdi), %xmm1
-; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm5
-; AVX512BW-FCP-NEXT: vmovdqa 96(%rdi), %xmm2
-; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm3
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
-; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm16 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
-; AVX512BW-FCP-NEXT: vmovdqa 80(%rdi), %xmm3
-; AVX512BW-FCP-NEXT: vpshufb %xmm16, %xmm3, %xmm17
-; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm5
-; AVX512BW-FCP-NEXT: vpshufb %xmm16, %xmm5, %xmm16
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm16[0],xmm17[0],xmm16[1],xmm17[1],xmm16[2],xmm17[2],xmm16[3],xmm17[3]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
-; AVX512BW-FCP-NEXT: vpmovqb %zmm0, %xmm8
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-FCP-NEXT: vpermd %ymm15, %ymm0, %ymm2
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5],ymm9[6,7]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm1[6,7]
+; AVX512BW-FCP-NEXT: vmovdqa 96(%rdi), %ymm12
+; AVX512BW-FCP-NEXT: vpmovqd %ymm12, %xmm1
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12]
+; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm16
+; AVX512BW-FCP-NEXT: vpmovqd %ymm16, %xmm0
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm0[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1,2],xmm9[3]
+; AVX512BW-FCP-NEXT: vpmovqb %zmm4, %xmm14
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
; AVX512BW-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm16 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512BW-FCP-NEXT: vpshufb %ymm16, %ymm4, %ymm6
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm17 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512BW-FCP-NEXT: vpshufb %ymm17, %ymm26, %ymm7
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5,6],ymm6[7]
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm18 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512BW-FCP-NEXT: vpshufb %ymm18, %ymm27, %ymm7
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5,6],ymm6[7]
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm31 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512BW-FCP-NEXT: vpshufb %ymm31, %ymm3, %ymm9
; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm19 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm28, %ymm8
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
-; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm7 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm8
-; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm7
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm20
-; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm8
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm20[0],xmm8[1],xmm20[1],xmm8[2],xmm20[2],xmm8[3],xmm20[3]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
-; AVX512BW-FCP-NEXT: vpsrlq $8, %zmm0, %zmm8
-; AVX512BW-FCP-NEXT: vpmovqb %zmm8, %xmm8
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-FCP-NEXT: vmovdqa64 %ymm6, %ymm29
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm20 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512BW-FCP-NEXT: vpshufb %ymm20, %ymm4, %ymm7
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm21 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512BW-FCP-NEXT: vpshufb %ymm21, %ymm26, %ymm8
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5,6],ymm7[7]
+; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm2, %ymm10
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7]
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm20 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512BW-FCP-NEXT: vpshufb %xmm20, %xmm1, %xmm9
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm21 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512BW-FCP-NEXT: vpshufb %xmm21, %xmm0, %xmm10
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3]
+; AVX512BW-FCP-NEXT: vpsrlq $8, %zmm4, %zmm10
+; AVX512BW-FCP-NEXT: vpmovqb %zmm10, %xmm10
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-FCP-NEXT: vmovdqa64 %ymm6, %ymm18
; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm22 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512BW-FCP-NEXT: vpshufb %ymm22, %ymm27, %ymm8
+; AVX512BW-FCP-NEXT: vpshufb %ymm22, %ymm7, %ymm6
; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm23 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512BW-FCP-NEXT: vpshufb %ymm23, %ymm28, %ymm10
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5],ymm10[6,7]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
-; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm10
-; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm8
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3]
-; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX512BW-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm24
-; AVX512BW-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm10
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm24[0],xmm10[1],xmm24[1],xmm10[2],xmm24[2],xmm10[3],xmm24[3]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3]
-; AVX512BW-FCP-NEXT: vpsrlq $16, %zmm0, %zmm10
-; AVX512BW-FCP-NEXT: vpmovqb %zmm10, %xmm10
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm24 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512BW-FCP-NEXT: vpshufb %ymm24, %ymm4, %ymm4
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm25 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512BW-FCP-NEXT: vpshufb %ymm25, %ymm26, %ymm8
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5,6],ymm4[7]
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm26 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512BW-FCP-NEXT: vpshufb %ymm26, %ymm27, %ymm8
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm27 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512BW-FCP-NEXT: vpshufb %ymm27, %ymm28, %ymm10
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5],ymm10[6,7]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7]
-; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm10
-; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm8
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3]
-; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512BW-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm28
-; AVX512BW-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm10
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm28[0],xmm10[1],xmm28[1],xmm10[2],xmm28[2],xmm10[3],xmm28[3]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3]
-; AVX512BW-FCP-NEXT: vpsrlq $24, %zmm0, %zmm10
-; AVX512BW-FCP-NEXT: vpmovqb %zmm10, %xmm10
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,3,2,3,1,3,5,7]
-; AVX512BW-FCP-NEXT: vpermd %ymm9, %ymm8, %ymm9
-; AVX512BW-FCP-NEXT: vpshufb %ymm12, %ymm9, %ymm10
-; AVX512BW-FCP-NEXT: vpermd %ymm11, %ymm8, %ymm11
-; AVX512BW-FCP-NEXT: vpshufb %ymm30, %ymm11, %ymm12
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm10[7]
-; AVX512BW-FCP-NEXT: vpermd %ymm13, %ymm8, %ymm10
-; AVX512BW-FCP-NEXT: vpshufb %ymm31, %ymm10, %ymm13
-; AVX512BW-FCP-NEXT: vpermd %ymm15, %ymm8, %ymm15
-; AVX512BW-FCP-NEXT: vpshufb %ymm14, %ymm15, %ymm8
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm13[5],ymm8[6,7]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm12[6,7]
-; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm12 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm13
-; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm12
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm3, %xmm14
-; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm5, %xmm13
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0,1,2],xmm12[3]
-; AVX512BW-FCP-NEXT: vpsrlq $32, %zmm0, %zmm13
+; AVX512BW-FCP-NEXT: vpshufb %ymm23, %ymm30, %ymm10
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4,5,6],ymm6[7]
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm24 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512BW-FCP-NEXT: vpshufb %ymm24, %ymm3, %ymm10
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm25 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512BW-FCP-NEXT: vpshufb %ymm25, %ymm2, %ymm13
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3,4],ymm10[5],ymm13[6,7]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4,5],ymm6[6,7]
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm26 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512BW-FCP-NEXT: vpshufb %xmm26, %xmm1, %xmm10
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm27 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512BW-FCP-NEXT: vpshufb %xmm27, %xmm0, %xmm13
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm13[0,1,2],xmm10[3]
+; AVX512BW-FCP-NEXT: vpsrlq $16, %zmm4, %zmm13
; AVX512BW-FCP-NEXT: vpmovqb %zmm13, %xmm13
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-FCP-NEXT: vpshufb %ymm16, %ymm9, %ymm12
-; AVX512BW-FCP-NEXT: vpshufb %ymm17, %ymm11, %ymm13
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5,6],ymm12[7]
-; AVX512BW-FCP-NEXT: vpshufb %ymm18, %ymm10, %ymm13
-; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm15, %ymm14
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5],ymm14[6,7]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
-; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm1, %xmm14
-; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm13
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
-; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm14 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX512BW-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm16
-; AVX512BW-FCP-NEXT: vpshufb %xmm14, %xmm5, %xmm14
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm16[0],xmm14[1],xmm16[1],xmm14[2],xmm16[2],xmm14[3],xmm16[3]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3]
-; AVX512BW-FCP-NEXT: vpsrlq $40, %zmm0, %zmm14
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm13[0,1],xmm10[2,3]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm28 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512BW-FCP-NEXT: vpshufb %ymm28, %ymm7, %ymm7
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm29 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512BW-FCP-NEXT: vpshufb %ymm29, %ymm30, %ymm10
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm7[7]
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm30 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512BW-FCP-NEXT: vpshufb %ymm30, %ymm3, %ymm3
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm17 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512BW-FCP-NEXT: vpshufb %ymm17, %ymm2, %ymm2
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm10[6,7]
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm10
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512BW-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm10[3]
+; AVX512BW-FCP-NEXT: vpsrlq $24, %zmm4, %zmm10
+; AVX512BW-FCP-NEXT: vpmovqb %zmm10, %xmm10
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7]
+; AVX512BW-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm10
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28]
+; AVX512BW-FCP-NEXT: vpermd %ymm8, %ymm0, %ymm8
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5,6],ymm5[7]
+; AVX512BW-FCP-NEXT: vpermd %ymm11, %ymm0, %ymm11
+; AVX512BW-FCP-NEXT: vpermd %ymm15, %ymm0, %ymm13
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,3,5,7,5,7,6,7]
+; AVX512BW-FCP-NEXT: vpermd %ymm12, %ymm5, %ymm12
+; AVX512BW-FCP-NEXT: vpermd %ymm16, %ymm5, %ymm15
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm12[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm15[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm14[0,1,2],xmm5[3]
+; AVX512BW-FCP-NEXT: vpsrlq $32, %zmm4, %zmm14
; AVX512BW-FCP-NEXT: vpmovqb %zmm14, %xmm14
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
-; AVX512BW-FCP-NEXT: vpshufb %ymm20, %ymm9, %ymm13
-; AVX512BW-FCP-NEXT: vpshufb %ymm21, %ymm11, %ymm14
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7]
-; AVX512BW-FCP-NEXT: vpshufb %ymm22, %ymm10, %ymm14
-; AVX512BW-FCP-NEXT: vpshufb %ymm23, %ymm15, %ymm6
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm14[5],ymm6[6,7]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm13[6,7]
-; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm1, %xmm14
-; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm13
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
-; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm14 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX512BW-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm16
-; AVX512BW-FCP-NEXT: vpshufb %xmm14, %xmm5, %xmm14
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm16[0],xmm14[1],xmm16[1],xmm14[2],xmm16[2],xmm14[3],xmm16[3]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3]
-; AVX512BW-FCP-NEXT: vpsrlq $48, %zmm0, %zmm14
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm14[0,1],xmm5[2,3]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7]
+; AVX512BW-FCP-NEXT: vpshufb %ymm31, %ymm11, %ymm14
+; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm13, %ymm9
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm14[5],ymm9[6,7]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6,7]
+; AVX512BW-FCP-NEXT: vpshufb %xmm20, %xmm12, %xmm9
+; AVX512BW-FCP-NEXT: vpshufb %xmm21, %xmm15, %xmm14
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1,2],xmm9[3]
+; AVX512BW-FCP-NEXT: vpsrlq $40, %zmm4, %zmm14
; AVX512BW-FCP-NEXT: vpmovqb %zmm14, %xmm14
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-FCP-NEXT: vpshufb %ymm24, %ymm9, %ymm9
-; AVX512BW-FCP-NEXT: vpshufb %ymm25, %ymm11, %ymm11
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7]
-; AVX512BW-FCP-NEXT: vpshufb %ymm26, %ymm10, %ymm10
-; AVX512BW-FCP-NEXT: vpshufb %ymm27, %ymm15, %ymm11
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4],ymm10[5],ymm11[6,7]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7]
-; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX512BW-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm1
-; AVX512BW-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm2
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm3
-; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm2
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
-; AVX512BW-FCP-NEXT: vpsrlq $56, %zmm0, %zmm0
-; AVX512BW-FCP-NEXT: vpmovqb %zmm0, %xmm0
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX512BW-FCP-NEXT: vmovaps %ymm1, (%rsi)
-; AVX512BW-FCP-NEXT: vmovdqa64 %ymm29, (%rdx)
-; AVX512BW-FCP-NEXT: vmovdqa %ymm7, (%rcx)
-; AVX512BW-FCP-NEXT: vmovdqa %ymm4, (%r8)
-; AVX512BW-FCP-NEXT: vmovdqa %ymm8, (%r9)
-; AVX512BW-FCP-NEXT: vmovdqa %ymm12, (%r11)
-; AVX512BW-FCP-NEXT: vmovdqa %ymm6, (%r10)
-; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rax)
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7]
+; AVX512BW-FCP-NEXT: vpshufb %ymm22, %ymm10, %ymm9
+; AVX512BW-FCP-NEXT: vpshufb %ymm23, %ymm8, %ymm14
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3,4,5,6],ymm9[7]
+; AVX512BW-FCP-NEXT: vpshufb %ymm24, %ymm11, %ymm14
+; AVX512BW-FCP-NEXT: vpshufb %ymm25, %ymm13, %ymm7
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm14[5],ymm7[6,7]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7]
+; AVX512BW-FCP-NEXT: vpshufb %xmm26, %xmm12, %xmm9
+; AVX512BW-FCP-NEXT: vpshufb %xmm27, %xmm15, %xmm14
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1,2],xmm9[3]
+; AVX512BW-FCP-NEXT: vpsrlq $48, %zmm4, %zmm14
+; AVX512BW-FCP-NEXT: vpmovqb %zmm14, %xmm14
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
+; AVX512BW-FCP-NEXT: vpshufb %ymm28, %ymm10, %ymm9
+; AVX512BW-FCP-NEXT: vpshufb %ymm29, %ymm8, %ymm8
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7]
+; AVX512BW-FCP-NEXT: vpshufb %ymm30, %ymm11, %ymm9
+; AVX512BW-FCP-NEXT: vpshufb %ymm17, %ymm13, %ymm10
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7]
+; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm12, %xmm2
+; AVX512BW-FCP-NEXT: vpshufb %xmm1, %xmm15, %xmm1
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
+; AVX512BW-FCP-NEXT: vpsrlq $56, %zmm4, %zmm2
+; AVX512BW-FCP-NEXT: vpmovqb %zmm2, %xmm2
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512BW-FCP-NEXT: vmovaps %ymm2, (%rsi)
+; AVX512BW-FCP-NEXT: vmovdqa64 %ymm18, (%rdx)
+; AVX512BW-FCP-NEXT: vmovdqa %ymm6, (%rcx)
+; AVX512BW-FCP-NEXT: vmovdqa %ymm3, (%r8)
+; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%r9)
+; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-FCP-NEXT: vmovdqa %ymm5, (%rax)
+; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-FCP-NEXT: vmovdqa %ymm7, (%rax)
+; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-FCP-NEXT: vmovdqa %ymm1, (%rax)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
@@ -8352,214 +7824,169 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX512DQ-BW-FCP-LABEL: load_i8_stride8_vf32:
; AVX512DQ-BW-FCP: # %bb.0:
-; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm12 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 224(%rdi), %ymm9
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm9, %ymm1, %ymm4
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm12, %ymm4, %ymm2
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm30 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm11
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm11, %ymm1, %ymm26
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm30, %ymm26, %ymm3
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm31 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm13
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm13, %ymm1, %ymm27
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm31, %ymm27, %ymm3
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm4
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 224(%rdi), %ymm5
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm7
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm8
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm8, %ymm0, %ymm30
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm11
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm11, %ymm0, %ymm3
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm15
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm15, %ymm1, %ymm28
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm14, %ymm28, %ymm1
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 112(%rdi), %xmm1
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm5
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 96(%rdi), %xmm2
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm3
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm16 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 80(%rdi), %xmm3
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm16, %xmm3, %xmm17
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm5
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm16, %xmm5, %xmm16
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm16[0],xmm17[0],xmm16[1],xmm17[1],xmm16[2],xmm17[2],xmm16[3],xmm17[3]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm0, %xmm8
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm15, %ymm0, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5],ymm9[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 96(%rdi), %ymm12
+; AVX512DQ-BW-FCP-NEXT: vpmovqd %ymm12, %xmm1
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm16
+; AVX512DQ-BW-FCP-NEXT: vpmovqd %ymm16, %xmm0
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm0[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1,2],xmm9[3]
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm4, %xmm14
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm16 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm16, %ymm4, %ymm6
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm17 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm17, %ymm26, %ymm7
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5,6],ymm6[7]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm18 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm18, %ymm27, %ymm7
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5,6],ymm6[7]
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm31 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm31, %ymm3, %ymm9
; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm19 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm28, %ymm8
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm7 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm8
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm7
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm20
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm8
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm20[0],xmm8[1],xmm20[1],xmm8[2],xmm20[2],xmm8[3],xmm20[3]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $8, %zmm0, %zmm8
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm8, %xmm8
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %ymm6, %ymm29
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm20 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm20, %ymm4, %ymm7
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm21 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm21, %ymm26, %ymm8
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5,6],ymm7[7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm2, %ymm10
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm20 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm20, %xmm1, %xmm9
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm21 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm21, %xmm0, %xmm10
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3]
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $8, %zmm4, %zmm10
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm10, %xmm10
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %ymm6, %ymm18
; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm22 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm22, %ymm27, %ymm8
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm22, %ymm7, %ymm6
; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm23 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm23, %ymm28, %ymm10
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5],ymm10[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm10
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm8
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm24
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm10
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm24[0],xmm10[1],xmm24[1],xmm10[2],xmm24[2],xmm10[3],xmm24[3]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3]
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %zmm0, %zmm10
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm10, %xmm10
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm24 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm24, %ymm4, %ymm4
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm25 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm25, %ymm26, %ymm8
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5,6],ymm4[7]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm26 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm26, %ymm27, %ymm8
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm27 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm27, %ymm28, %ymm10
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5],ymm10[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm10
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm8
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm28
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm10
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm28[0],xmm10[1],xmm28[1],xmm10[2],xmm28[2],xmm10[3],xmm28[3]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3]
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $24, %zmm0, %zmm10
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm10, %xmm10
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,3,2,3,1,3,5,7]
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm9, %ymm8, %ymm9
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm12, %ymm9, %ymm10
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm11, %ymm8, %ymm11
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm30, %ymm11, %ymm12
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm10[7]
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm13, %ymm8, %ymm10
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm31, %ymm10, %ymm13
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm15, %ymm8, %ymm15
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm14, %ymm15, %ymm8
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm13[5],ymm8[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm12[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm12 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm13
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm12
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm3, %xmm14
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm5, %xmm13
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0,1,2],xmm12[3]
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $32, %zmm0, %zmm13
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm23, %ymm30, %ymm10
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4,5,6],ymm6[7]
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm24 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm24, %ymm3, %ymm10
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm25 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm25, %ymm2, %ymm13
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3,4],ymm10[5],ymm13[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm26 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm26, %xmm1, %xmm10
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm27 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm27, %xmm0, %xmm13
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm13[0,1,2],xmm10[3]
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %zmm4, %zmm13
; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm13, %xmm13
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm16, %ymm9, %ymm12
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm17, %ymm11, %ymm13
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5,6],ymm12[7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm18, %ymm10, %ymm13
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm15, %ymm14
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5],ymm14[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm1, %xmm14
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm13
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm14 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm16
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm14, %xmm5, %xmm14
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm16[0],xmm14[1],xmm16[1],xmm14[2],xmm16[2],xmm14[3],xmm16[3]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3]
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $40, %zmm0, %zmm14
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm13[0,1],xmm10[2,3]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm28 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm28, %ymm7, %ymm7
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm29 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm29, %ymm30, %ymm10
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm7[7]
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm30 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm30, %ymm3, %ymm3
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm17 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm17, %ymm2, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm10[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm10
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm10[3]
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $24, %zmm4, %zmm10
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm10, %xmm10
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7]
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm10
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28]
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm8, %ymm0, %ymm8
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5,6],ymm5[7]
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm11, %ymm0, %ymm11
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm15, %ymm0, %ymm13
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,3,5,7,5,7,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm12, %ymm5, %ymm12
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm16, %ymm5, %ymm15
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm12[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm15[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm14[0,1,2],xmm5[3]
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $32, %zmm4, %zmm14
; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm20, %ymm9, %ymm13
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm21, %ymm11, %ymm14
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm22, %ymm10, %ymm14
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm23, %ymm15, %ymm6
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm14[5],ymm6[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm13[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm1, %xmm14
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm13
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm14 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm16
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm14, %xmm5, %xmm14
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm16[0],xmm14[1],xmm16[1],xmm14[2],xmm16[2],xmm14[3],xmm16[3]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3]
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %zmm0, %zmm14
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm14[0,1],xmm5[2,3]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm31, %ymm11, %ymm14
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm13, %ymm9
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm14[5],ymm9[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm20, %xmm12, %xmm9
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm21, %xmm15, %xmm14
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1,2],xmm9[3]
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $40, %zmm4, %zmm14
; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm24, %ymm9, %ymm9
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm25, %ymm11, %ymm11
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm26, %ymm10, %ymm10
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm27, %ymm15, %ymm11
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4],ymm10[5],ymm11[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm1
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm2
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm3
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm2
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $56, %zmm0, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm0, %xmm0
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovaps %ymm1, (%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %ymm29, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm7, (%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm4, (%r8)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm8, (%r9)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm12, (%r11)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm6, (%r10)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rax)
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm22, %ymm10, %ymm9
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm23, %ymm8, %ymm14
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3,4,5,6],ymm9[7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm24, %ymm11, %ymm14
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm25, %ymm13, %ymm7
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm14[5],ymm7[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm26, %xmm12, %xmm9
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm27, %xmm15, %xmm14
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1,2],xmm9[3]
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %zmm4, %zmm14
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm14, %xmm14
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm28, %ymm10, %ymm9
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm29, %ymm8, %ymm8
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm30, %ymm11, %ymm9
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm17, %ymm13, %ymm10
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm12, %xmm2
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm1, %xmm15, %xmm1
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $56, %zmm4, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm2, %xmm2
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovaps %ymm2, (%rsi)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %ymm18, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm6, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, (%r8)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%r9)
+; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm5, (%rax)
+; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm7, (%rax)
+; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm1, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%wide.vec = load <256 x i8>, ptr %in.vec, align 64
@@ -13109,641 +12536,471 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX2-FCP-LABEL: load_i8_stride8_vf64:
; AVX2-FCP: # %bb.0:
-; AVX2-FCP-NEXT: subq $904, %rsp # imm = 0x388
-; AVX2-FCP-NEXT: vmovdqa 368(%rdi), %xmm0
-; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
-; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %xmm2
-; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm2
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX2-FCP-NEXT: vmovdqa 336(%rdi), %xmm15
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm15, %xmm2
-; AVX2-FCP-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %xmm4
-; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm4
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
-; AVX2-FCP-NEXT: vmovdqa 304(%rdi), %xmm14
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
-; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm14, %xmm4
-; AVX2-FCP-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %xmm5
-; AVX2-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm5
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX2-FCP-NEXT: vmovdqa 272(%rdi), %xmm5
-; AVX2-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
-; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm5
-; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %xmm6
-; AVX2-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm6
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; AVX2-FCP-NEXT: vmovdqa 448(%rdi), %ymm6
-; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3]
-; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %ymm5
-; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm4[0,1],xmm0[2,3]
-; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6]
-; AVX2-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm4
+; AVX2-FCP-NEXT: subq $1096, %rsp # imm = 0x448
+; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %ymm4
; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28]
-; AVX2-FCP-NEXT: vpermd %ymm6, %ymm0, %ymm5
+; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %ymm5
; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm5, %ymm5
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2,3,4,5,6],ymm4[7]
-; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %ymm5
+; AVX2-FCP-NEXT: vmovdqa 448(%rdi), %ymm3
+; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %ymm0
+; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6]
+; AVX2-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm10
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm10, %ymm0
+; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpermd %ymm3, %ymm1, %ymm13
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm13, %ymm2
+; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
+; AVX2-FCP-NEXT: vpermd %ymm5, %ymm1, %ymm2
+; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm3
+; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
+; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm5
+; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm3
+; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7]
+; AVX2-FCP-NEXT: vpermd %ymm3, %ymm0, %ymm3
+; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm12 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm3
+; AVX2-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm4
+; AVX2-FCP-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm4
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3]
+; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm5
; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %ymm4
+; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm4
; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpermd %ymm4, %ymm0, %ymm13
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpermd %ymm4, %ymm0, %ymm8
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm5
; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm12
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm10[4,5,6,7]
+; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm6 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm5
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm3
+; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm2
+; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm2
+; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm2, %ymm2
+; AVX2-FCP-NEXT: vpermd %ymm3, %ymm1, %ymm4
+; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm4, %ymm3
+; AVX2-FCP-NEXT: vmovdqa %ymm4, %ymm9
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm3
+; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm4
; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 112(%rdi), %xmm4
-; AVX2-FCP-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill
-; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm4, %xmm9
-; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm4
-; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm4, %xmm1
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3]
-; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm4
-; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm9
-; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm4
-; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm3
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm4
-; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm3
-; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm2
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm4
-; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm3
-; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm3
-; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm4, %xmm8
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm2[0,1],xmm1[2,3]
-; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm2
+; AVX2-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm4
+; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpermd %ymm3, %ymm1, %ymm3
+; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
+; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm3
+; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm2
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm1
-; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1
-; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28]
; AVX2-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm2
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm10
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm1[7]
-; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm1
-; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm2
+; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm2
+; AVX2-FCP-NEXT: vpermd %ymm3, %ymm0, %ymm3
+; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm15
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0,1,2],xmm2[3]
+; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm2
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm3
+; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpermd %ymm3, %ymm0, %ymm11
; AVX2-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm2
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1
-; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm12
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5],ymm12[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm11[u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm14
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm14[0],xmm0[1],xmm14[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm0
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm10
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm15, %xmm12
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm11
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1,2],xmm0[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm14, %xmm12
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm15, %xmm14
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm14 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm0, %xmm0
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm4, %xmm9
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,6],ymm1[7]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4],ymm9[5],ymm12[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm12, %ymm1
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm7, %ymm14
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4],ymm1[5],ymm14[6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13]
+; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1,2],xmm1[3]
+; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm6[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm0
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm13, %xmm1
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm1
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm8
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm1, %xmm1
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm8
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm8, %xmm8
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm9, %xmm9
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm8[0],xmm1[1],xmm8[2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5,6],ymm1[7]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
+; AVX2-FCP-NEXT: vmovdqa %ymm9, %ymm3
+; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm0
-; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm1
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm6, %xmm9
-; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm10
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm10
-; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm15, %xmm11
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm12
-; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm4, %xmm14
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5,6],ymm10[7]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm1
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5],ymm10[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7]
+; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm14
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4],ymm1[5],ymm14[6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1,2],xmm1[3]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm4, %xmm0
-; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm13, %xmm8
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm13, %xmm8
-; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm1
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm15, %xmm1
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm8
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm5, %xmm8
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm5, %xmm9
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm8[0],xmm1[1],xmm8[2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30]
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5,6],ymm1[7]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm12, %ymm1
+; AVX2-FCP-NEXT: vmovdqa %ymm11, %ymm12
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm7, %ymm14
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4],ymm1[5],ymm14[6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm13, %xmm1
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm5, %xmm14
+; AVX2-FCP-NEXT: vmovdqa %xmm13, %xmm5
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1,2],xmm1[3]
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm8, %xmm14
+; AVX2-FCP-NEXT: vmovdqa %xmm13, %xmm8
+; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm13 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm6, %xmm15
+; AVX2-FCP-NEXT: vmovdqa %xmm13, %xmm6
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30]
+; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm13
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
+; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm9, %ymm1
+; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm14
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4],ymm1[5],ymm14[6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm10, %xmm1
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm14
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1,2],xmm1[3]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm14
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm15
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm5 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm0
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm1
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm1 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm6, %xmm9
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm10
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm10
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm8, %xmm11
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm12
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm14
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5,6],ymm10[7]
-; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm6, %ymm7
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm0
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm1
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1
; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm6, %ymm6
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm14
+; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm4
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4],ymm1[5],ymm14[6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm1, %xmm1
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm8, %xmm12
+; AVX2-FCP-NEXT: vmovdqa %xmm2, %xmm8
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0,1,2],xmm1[3]
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm12 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm10, %xmm11
+; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm2 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm10, %xmm10
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm6 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm0
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm6
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
-; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm13, %xmm6
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm13, %xmm1
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm15, %xmm1
-; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm6
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm6
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm7
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0],xmm1[1],xmm6[2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm13, %ymm0
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm3
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm2
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm1
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm6
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4],ymm1[5],ymm6[6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm1, %xmm1
+; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm4
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3]
+; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm3
+; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm2
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm14, %xmm1
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm6 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1
-; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm2
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm7 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm1
-; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm8, %xmm2
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm2
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm3
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm0[2,3]
-; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7]
-; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,3,2,3,1,3,5,7]
+; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm3, %ymm3
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7]
+; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm3
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0,1,2,3,4,5,6],ymm3[7]
-; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm3, %ymm5
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm4, %ymm6
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm5[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,5,7,5,7,6,7]
+; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm7
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm8
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm8[0,1,2],xmm7[3]
+; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm11
+; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm8 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm12
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm9[4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm11
+; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm12
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1,2,3,4,5,6],ymm11[7]
+; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm2, %ymm1
+; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm6, %ymm15
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4],ymm1[5],ymm15[6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm13[6,7]
; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm12
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm10[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm9
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm5
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm9
-; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm13, %xmm6
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm6
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm7
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm4, %xmm7
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm9, %xmm8
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm6[0,1],xmm5[2,3]
-; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm5, %ymm7
+; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm2
; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm7[7]
-; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm14
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1,2],xmm2[3]
+; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill
+; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm0
+; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm4
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm14[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4],ymm10[5],ymm11[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm9[4,5,6,7]
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm0
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm12, %ymm1
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm13, %ymm1
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm15, %ymm4
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5],ymm4[6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm14[0],xmm4[1],xmm14[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm9
-; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm14, %xmm10
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm11
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm6, %xmm12
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm12
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm13
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm14
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm15
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0],xmm12[1],xmm14[2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0,1],xmm9[2,3]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5,6],ymm12[7]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5],ymm12[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm12[4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm10, %ymm0
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm1
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm4
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5],ymm4[6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm14[0],xmm4[1],xmm14[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm9
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm8
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
-; AVX2-FCP-NEXT: vmovdqa %xmm2, %xmm1
-; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm9
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm10
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
-; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm9
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm10
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
-; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm4, %xmm10
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm4, %xmm11
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm0
+; AVX2-FCP-NEXT: vmovdqa %ymm5, %ymm3
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm12, %ymm1
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm13, %ymm1
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm15, %ymm4
+; AVX2-FCP-NEXT: vmovdqa %ymm15, %ymm5
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5],ymm4[6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm11[u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm14[0],xmm4[1],xmm14[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm10, %ymm0
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm13, %ymm1
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
+; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm1
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm4
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5],ymm4[6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14]
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5,6],ymm9[7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm9[u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3]
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4],ymm10[5],ymm11[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm9, %xmm9
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm10, %xmm10
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm11
-; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm6, %xmm12
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm6, %xmm12
-; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm13
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm14
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm5, %xmm15
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0],xmm12[1],xmm14[2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0,1],xmm9[2,3]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5,6],ymm12[7]
-; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm14, %ymm14
-; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm15, %ymm15
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5],ymm12[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm12[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa (%rsp), %xmm9 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm9, %xmm9
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm8
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
-; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm9
-; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm10
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm9
-; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm10
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm10
-; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm4, %xmm11
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm10[u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm15 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm15[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm14[0],xmm4[1],xmm14[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5,6],ymm9[7]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm10
+; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4],ymm10[5],ymm11[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm10
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm11
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm1, %xmm12
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm1, %xmm13
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm12 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm6, %xmm13
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm14
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm14 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm7, %xmm15
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm1, %xmm8
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0],xmm13[1],xmm8[2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3]
-; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm1, %ymm1
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm2
-; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm1
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm5, %ymm4
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5],ymm4[6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
-; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm2
-; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm3, %ymm3
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm2
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm3
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm3
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm4, %xmm4
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm3
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm4, %xmm4
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm4, %xmm4
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm8, %xmm8
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
-; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm3
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm4
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm4
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm11[u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm12[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm0
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
+; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0
+; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm13, %ymm4
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6],ymm0[7]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm4
+; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm5
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm9[u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm15[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm2, (%rsi)
@@ -13778,7 +13035,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rax)
; AVX2-FCP-NEXT: vmovdqa %ymm1, 32(%rax)
-; AVX2-FCP-NEXT: addq $904, %rsp # imm = 0x388
+; AVX2-FCP-NEXT: addq $1096, %rsp # imm = 0x448
; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
;
@@ -14534,557 +13791,428 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX512-FCP-LABEL: load_i8_stride8_vf64:
; AVX512-FCP: # %bb.0:
-; AVX512-FCP-NEXT: subq $440, %rsp # imm = 0x1B8
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512-FCP-NEXT: subq $232, %rsp
+; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm29
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6]
; AVX512-FCP-NEXT: vmovdqa 480(%rdi), %ymm1
; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm3
-; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm1
-; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm19
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm1
+; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm23
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX512-FCP-NEXT: vmovdqa 448(%rdi), %ymm2
; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm4
-; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm4, %ymm2
+; AVX512-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm5
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm5, %ymm2
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512-FCP-NEXT: vmovdqa 416(%rdi), %ymm2
+; AVX512-FCP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill
+; AVX512-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm4
+; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm2
+; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm22
; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512-FCP-NEXT: vmovdqa 416(%rdi), %ymm3
+; AVX512-FCP-NEXT: vmovdqa 384(%rdi), %ymm3
; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpermd %ymm3, %ymm0, %ymm12
-; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm12, %ymm3
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512-FCP-NEXT: vmovdqa 384(%rdi), %ymm5
-; AVX512-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm13
-; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm13, %ymm5
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
-; AVX512-FCP-NEXT: vmovdqa 368(%rdi), %xmm2
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm5
-; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm30
-; AVX512-FCP-NEXT: vmovdqa 352(%rdi), %xmm2
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm6
-; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm25
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
-; AVX512-FCP-NEXT: vmovdqa 336(%rdi), %xmm2
-; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm7
-; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm24
-; AVX512-FCP-NEXT: vmovdqa 320(%rdi), %xmm15
-; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm15, %xmm8
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3]
-; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm28
-; AVX512-FCP-NEXT: vpmovqb %zmm28, %xmm7
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm16
+; AVX512-FCP-NEXT: vpermd %ymm3, %ymm0, %ymm7
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm7, %ymm3
+; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm19
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
+; AVX512-FCP-NEXT: vmovdqa 352(%rdi), %ymm2
+; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpmovqd %ymm2, %xmm4
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm2
+; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm17
+; AVX512-FCP-NEXT: vmovdqa 320(%rdi), %ymm3
+; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpmovqd %ymm3, %xmm12
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm12, %xmm3
+; AVX512-FCP-NEXT: vmovdqa64 %xmm12, %xmm21
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3]
+; AVX512-FCP-NEXT: vpmovqb %zmm29, %xmm3
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm12
; AVX512-FCP-NEXT: movb $-64, %al
; AVX512-FCP-NEXT: kmovw %eax, %k1
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16 {%k1}
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm12 {%k1}
; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm1
; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm2
-; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm1
-; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm23
-; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm2
-; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm2
-; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm6
-; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm27
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm1[7]
-; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm1
-; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm2
-; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1
-; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm1, %ymm7
-; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm18
-; AVX512-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm8
-; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm17
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm6[6,7]
-; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm10
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm10, %xmm7
-; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm0
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm3
-; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm20
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3]
-; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm2
-; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm0
-; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm31
-; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm2
-; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm5
-; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm22
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3]
-; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm29
-; AVX512-FCP-NEXT: vpmovqb %zmm29, %xmm3
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm0
-; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm11
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm11, %ymm0
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm1
-; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm26
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm12, %ymm19
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm13, %ymm3
-; AVX512-FCP-NEXT: vmovdqa64 %ymm13, %ymm21
+; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm1
; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm16
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm30, %xmm13
-; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm13, %xmm3
-; AVX512-FCP-NEXT: vmovdqa64 %xmm25, %xmm14
-; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm14, %xmm5
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm9
-; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm9, %xmm0
-; AVX512-FCP-NEXT: vmovdqa %xmm15, %xmm12
-; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm15, %xmm15
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3]
-; AVX512-FCP-NEXT: vpsrlq $8, %zmm28, %zmm3
-; AVX512-FCP-NEXT: vpmovqb %zmm3, %xmm3
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 {%k1}
-; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm8
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm8, %ymm2
-; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm4
-; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm3
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm7
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm6
-; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm15
-; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm6, %ymm15
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3,4],ymm3[5],ymm15[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm3
-; AVX512-FCP-NEXT: vmovdqa64 %xmm10, %xmm16
-; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm10
-; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm1
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm15
-; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm15, %xmm3
-; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm15
-; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm15, %xmm5
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX512-FCP-NEXT: vpsrlq $8, %zmm29, %zmm3
-; AVX512-FCP-NEXT: vpmovqb %zmm3, %xmm3
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
-; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm11, %ymm0
-; AVX512-FCP-NEXT: vmovdqa64 %ymm11, %ymm20
-; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm23
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm11
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm11, %ymm1
-; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm24
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm1
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm25
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm2
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm26
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
-; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm13, %xmm2
-; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm14, %xmm3
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm5
-; AVX512-FCP-NEXT: vmovdqa64 %xmm9, %xmm17
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm12, %xmm15
-; AVX512-FCP-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3]
-; AVX512-FCP-NEXT: vpsrlq $16, %zmm28, %zmm5
-; AVX512-FCP-NEXT: vpmovqb %zmm5, %xmm5
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1}
-; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm8, %ymm0
-; AVX512-FCP-NEXT: vmovdqa %ymm8, %ymm9
-; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm5
-; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm5
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm4
-; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm7, %ymm5
-; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm25
-; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm4
-; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm15
-; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm26
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4],ymm5[5],ymm15[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm16, %xmm6
-; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm6, %xmm5
-; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm1
-; AVX512-FCP-NEXT: vmovdqa64 %xmm10, %xmm16
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm7
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm5
-; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm8
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm3
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX512-FCP-NEXT: vpsrlq $16, %zmm29, %zmm3
-; AVX512-FCP-NEXT: vpmovqb %zmm3, %xmm3
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
-; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm18
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm11, %ymm1
-; AVX512-FCP-NEXT: vmovdqa %ymm2, %ymm11
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm1
-; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm1
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm2
-; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
-; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm13, %xmm2
-; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm14, %xmm3
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm17, %xmm15
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm15, %xmm5
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm12, %xmm12
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3]
-; AVX512-FCP-NEXT: vpsrlq $24, %zmm28, %zmm5
-; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm24
-; AVX512-FCP-NEXT: vpmovqb %zmm5, %xmm5
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1}
-; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm9, %ymm0
-; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm5
-; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm5
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm5
-; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm5, %ymm5
-; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm9
-; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm4
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm6, %xmm4
-; AVX512-FCP-NEXT: vmovdqa64 %xmm6, %xmm21
-; AVX512-FCP-NEXT: vmovdqa64 %xmm16, %xmm5
-; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm1
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm4
-; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm16
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm3
-; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm6
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX512-FCP-NEXT: vpsrlq $24, %zmm29, %zmm3
-; AVX512-FCP-NEXT: vpmovqb %zmm3, %xmm3
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
-; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7]
-; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm17 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm1
-; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1
-; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm28
-; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm18 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm2
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm2
+; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm31
+; AVX512-FCP-NEXT: vpermd %ymm31, %ymm0, %ymm3
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm3, %ymm2
+; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm20
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
-; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm9
-; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm19
-; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm10
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
-; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm13, %xmm9
-; AVX512-FCP-NEXT: vmovdqa64 %xmm30, %xmm20
-; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm14, %xmm10
-; AVX512-FCP-NEXT: vmovdqa64 %xmm14, %xmm22
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm12 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
-; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm15, %xmm10
-; AVX512-FCP-NEXT: vmovdqa64 %xmm15, %xmm23
-; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm13
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3]
-; AVX512-FCP-NEXT: vpsrlq $32, %zmm24, %zmm10
-; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm25
-; AVX512-FCP-NEXT: vpmovqb %zmm10, %xmm10
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm9, %zmm0, %zmm13
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm13 {%k1}
-; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm1
-; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1
-; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm27
-; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm2
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm15
-; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm26
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5,6],ymm1[7]
-; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm1
-; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm31
-; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm14
-; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm30
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4],ymm1[5],ymm14[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm15[6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm10
-; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm10, %xmm14
-; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm5, %xmm11
-; AVX512-FCP-NEXT: vmovdqa64 %xmm5, %xmm24
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm16, %xmm8
-; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm8, %xmm14
-; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm6, %xmm12
-; AVX512-FCP-NEXT: vmovdqa64 %xmm6, %xmm21
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3]
-; AVX512-FCP-NEXT: vpsrlq $32, %zmm29, %zmm12
-; AVX512-FCP-NEXT: vpmovqb %zmm12, %xmm12
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm0
-; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm1
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm11
-; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm16
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6],ymm1[7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm11
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm18 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm7
-; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm12
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm2
+; AVX512-FCP-NEXT: vmovdqa64 160(%rdi), %ymm25
+; AVX512-FCP-NEXT: vpermd %ymm25, %ymm0, %ymm10
+; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm26
+; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm10, %ymm14
+; AVX512-FCP-NEXT: vpermd %ymm26, %ymm0, %ymm9
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm9, %ymm0
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX512-FCP-NEXT: vmovdqa64 96(%rdi), %ymm27
+; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm28
+; AVX512-FCP-NEXT: vpmovqd %ymm27, %xmm8
+; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm8, %xmm15
+; AVX512-FCP-NEXT: vpmovqd %ymm28, %xmm1
+; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm7
+; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm18
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm15[3]
+; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm30
+; AVX512-FCP-NEXT: vpmovqb %zmm30, %xmm15
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm15[0,1],xmm7[2,3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm14[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7
+; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm6
+; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm6, %ymm7
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm5, %ymm12
+; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm23
+; AVX512-FCP-NEXT: vmovdqa64 %ymm14, %ymm24
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5,6],ymm7[7]
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm3
+; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm12
+; AVX512-FCP-NEXT: vmovdqa64 %ymm14, %ymm22
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm5
+; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm5, %ymm14
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5],ymm7[6,7]
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm17, %xmm2
; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm12
-; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm19
-; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm4
-; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm4, %xmm13
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm23, %xmm9
-; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm9, %xmm14
-; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm15
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm4
+; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm4, %xmm14
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3]
-; AVX512-FCP-NEXT: vpsrlq $40, %zmm25, %zmm14
+; AVX512-FCP-NEXT: vpsrlq $8, %zmm29, %zmm14
; AVX512-FCP-NEXT: vpmovqb %zmm14, %xmm14
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm12 {%k1}
-; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm1
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1
-; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm2
-; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm14
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,6],ymm1[7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm31, %ymm3
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm14
-; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm3
-; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm2
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm15
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm12 {%k1}
+; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm0
+; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm7
+; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm1
+; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm14
+; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm1, %ymm14
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5,6],ymm7[7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm14
+; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm10, %ymm14
+; AVX512-FCP-NEXT: vmovdqa64 %ymm10, %ymm16
+; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm9, %ymm15
+; AVX512-FCP-NEXT: vmovdqa64 %ymm9, %ymm17
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm10, %xmm14
-; AVX512-FCP-NEXT: vmovdqa64 %xmm10, %xmm20
-; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm2
-; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm11
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3]
-; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm8, %xmm14
-; AVX512-FCP-NEXT: vmovdqa64 %xmm8, %xmm22
-; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm2
-; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm13
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1,2],xmm11[3]
-; AVX512-FCP-NEXT: vpsrlq $40, %zmm29, %zmm13
-; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm18
-; AVX512-FCP-NEXT: vpmovqb %zmm13, %xmm13
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm29
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm10
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm10, %ymm1
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm21 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm5
-; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm6
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm11
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6],ymm1[7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm11
-; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm17
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5],ymm7[6,7]
+; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm14
+; AVX512-FCP-NEXT: vmovdqa %xmm8, %xmm11
+; AVX512-FCP-NEXT: vmovdqa64 %xmm18, %xmm10
+; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm10, %xmm15
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3]
+; AVX512-FCP-NEXT: vpsrlq $8, %zmm30, %zmm15
+; AVX512-FCP-NEXT: vpmovqb %zmm15, %xmm15
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3],ymm7[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7
+; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm6, %ymm7
+; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm18
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm6
+; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm6, %ymm12
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5,6],ymm7[7]
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm12
+; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm22
; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm12
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm0
-; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm12
-; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm4, %xmm13
-; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm23
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
-; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm9, %xmm14
-; AVX512-FCP-NEXT: vmovdqa64 %xmm9, %xmm28
-; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm15
-; AVX512-FCP-NEXT: vmovdqa64 %xmm7, %xmm16
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
+; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm5, %ymm14
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5],ymm7[6,7]
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm12
+; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm20
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm14
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3]
-; AVX512-FCP-NEXT: vpsrlq $48, %zmm25, %zmm14
+; AVX512-FCP-NEXT: vpsrlq $16, %zmm29, %zmm14
; AVX512-FCP-NEXT: vpmovqb %zmm14, %xmm14
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm12 {%k1}
-; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm1
-; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm9
-; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm3
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm14
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,6],ymm1[7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm31, %ymm3
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm14
-; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm3
-; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm15
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm12 {%k1}
+; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm0, %ymm7
+; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm13
+; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm14
+; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm23
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5,6],ymm7[7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm2
+; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm14
+; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm1
+; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm15
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm8
-; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm14
-; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm15
-; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm15, %xmm11
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm3
-; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm3, %xmm14
-; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm13
-; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm20
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1,2],xmm11[3]
-; AVX512-FCP-NEXT: vpsrlq $48, %zmm18, %zmm13
-; AVX512-FCP-NEXT: vpmovqb %zmm13, %xmm13
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm21
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm10, %ymm6
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5],ymm7[6,7]
+; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm11, %xmm14
+; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm10, %xmm15
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3]
+; AVX512-FCP-NEXT: vpsrlq $16, %zmm30, %zmm15
+; AVX512-FCP-NEXT: vpmovqb %zmm15, %xmm15
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3],ymm7[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm21
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm3
+; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm7
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm12 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm9
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5,6],ymm7[7]
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm3
+; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm8
+; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm3
; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm5, %ymm5
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm2
-; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm3
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm4
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm2
-; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm5
-; AVX512-FCP-NEXT: vmovdqa64 %xmm23, %xmm2
-; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm6
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm28, %xmm2
-; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm11
-; AVX512-FCP-NEXT: vmovdqa64 %xmm16, %xmm2
-; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm12
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm11[0,1,2],xmm5[3]
-; AVX512-FCP-NEXT: vpsrlq $56, %zmm25, %zmm11
-; AVX512-FCP-NEXT: vpmovqb %zmm11, %xmm11
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm11[0,1],xmm5[2,3]
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm5, %zmm0, %zmm5
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm5 {%k1}
-; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm0, %ymm3
-; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm9, %ymm9
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5,6],ymm3[7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm31, %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm2
-; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
-; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm8, %xmm2
-; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm15, %xmm3
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm1
-; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm3
-; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm1
-; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm4
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm6
+; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm6, %ymm9
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm6
+; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm6, %xmm8
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm6
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm8[3]
+; AVX512-FCP-NEXT: vpsrlq $24, %zmm29, %zmm8
+; AVX512-FCP-NEXT: vpmovqb %zmm8, %xmm8
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3]
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm6
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm6 {%k1}
+; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm13, %ymm5
+; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm4
+; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm4, %ymm4
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
+; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm3
+; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm1, %ymm2
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm11, %xmm1
+; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm10, %xmm0
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
+; AVX512-FCP-NEXT: vpsrlq $24, %zmm30, %zmm1
+; AVX512-FCP-NEXT: vpmovqb %zmm1, %xmm1
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm18
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7]
+; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm1
+; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm10, %ymm2
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX512-FCP-NEXT: vpermd (%rsp), %ymm0, %ymm3 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm2
+; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm19
+; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm12 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm3
+; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm22
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [1,3,5,7,5,7,6,7]
+; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm8 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm15 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm8, %xmm2
+; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm7 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm7, %xmm3
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3]
-; AVX512-FCP-NEXT: vpsrlq $56, %zmm18, %zmm3
+; AVX512-FCP-NEXT: vpsrlq $32, %zmm29, %zmm3
; AVX512-FCP-NEXT: vpmovqb %zmm3, %xmm3
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
-; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovaps %zmm1, (%rsi)
-; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovaps %zmm1, (%rdx)
-; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovaps %zmm1, (%rcx)
-; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovaps %zmm1, (%r8)
-; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovaps %zmm1, (%r9)
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm17
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm17 {%k1}
+; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm1
+; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm23
+; AVX512-FCP-NEXT: vpermd %ymm31, %ymm0, %ymm4
+; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm2
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX512-FCP-NEXT: vpermd %ymm25, %ymm0, %ymm3
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm1
+; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm24
+; AVX512-FCP-NEXT: vpermd %ymm26, %ymm0, %ymm6
+; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm0
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX512-FCP-NEXT: vpermd %ymm27, %ymm16, %ymm13
+; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm13, %xmm0
+; AVX512-FCP-NEXT: vpermd %ymm28, %ymm16, %ymm12
+; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm15
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3]
+; AVX512-FCP-NEXT: vpsrlq $32, %zmm30, %zmm15
+; AVX512-FCP-NEXT: vpmovqb %zmm15, %xmm15
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm16
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm9, %ymm0
+; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm5
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm2
+; AVX512-FCP-NEXT: vmovdqa64 %ymm10, %ymm25
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm20 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm2
+; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm10
+; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm2
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm17 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm15
+; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm1
+; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm15
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4],ymm2[5],ymm15[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm19 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm2
+; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm8, %xmm2
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm7, %xmm15
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm15[0,1,2],xmm2[3]
+; AVX512-FCP-NEXT: vpsrlq $40, %zmm29, %zmm15
+; AVX512-FCP-NEXT: vpmovqb %zmm15, %xmm15
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,3]
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1}
+; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm3
+; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm0
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm15
+; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm22
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm11
+; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm4
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm15
+; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm11
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm6, %ymm14
+; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm23
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm11
+; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm13, %xmm14
+; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm12, %xmm15
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3]
+; AVX512-FCP-NEXT: vpsrlq $40, %zmm30, %zmm15
+; AVX512-FCP-NEXT: vpmovqb %zmm15, %xmm15
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm19
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm5, %ymm0
+; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm26
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm5
+; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm5, %ymm2
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm2
+; AVX512-FCP-NEXT: vmovdqa64 %ymm10, %ymm24
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm17 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm14
+; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm1, %ymm14
+; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm25
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4],ymm2[5],ymm14[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm20 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm2
+; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm8, %xmm2
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm7, %xmm14
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3]
+; AVX512-FCP-NEXT: vpsrlq $48, %zmm29, %zmm14
+; AVX512-FCP-NEXT: vpmovqb %zmm14, %xmm14
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3]
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1}
+; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm3, %ymm0
+; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm6
+; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm6, %ymm14
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6],ymm0[7]
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm14
+; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm9
+; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm1
+; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm15
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm9
+; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm13, %xmm14
+; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm12, %xmm15
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3]
+; AVX512-FCP-NEXT: vpsrlq $48, %zmm30, %zmm15
+; AVX512-FCP-NEXT: vpmovqb %zmm15, %xmm15
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm17
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm0
+; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm0, %ymm2
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm11
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3,4,5,6],ymm2[7]
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm5
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm10
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm5
+; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm5, %ymm9
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm2[6,7]
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm8, %xmm8
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm7, %xmm7
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3]
+; AVX512-FCP-NEXT: vpsrlq $56, %zmm29, %zmm8
+; AVX512-FCP-NEXT: vpmovqb %zmm8, %xmm8
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm0, %zmm7
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm7 {%k1}
+; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm2
+; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm6, %ymm6
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7]
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm5
+; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm4
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
+; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm13, %xmm3
+; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm12, %xmm1
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3]
+; AVX512-FCP-NEXT: vpsrlq $56, %zmm30, %zmm3
+; AVX512-FCP-NEXT: vpmovqb %zmm3, %xmm3
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1
+; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovaps %zmm2, (%rsi)
+; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovaps %zmm2, (%rdx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm21, (%rcx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm18, (%r8)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm16, (%r9)
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT: vmovdqa64 %zmm29, (%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm19, (%rax)
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT: vmovdqa64 %zmm21, (%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm17, (%rax)
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
-; AVX512-FCP-NEXT: addq $440, %rsp # imm = 0x1B8
+; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%rax)
+; AVX512-FCP-NEXT: addq $232, %rsp
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
@@ -15840,557 +14968,428 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX512DQ-FCP-LABEL: load_i8_stride8_vf64:
; AVX512DQ-FCP: # %bb.0:
-; AVX512DQ-FCP-NEXT: subq $440, %rsp # imm = 0x1B8
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-FCP-NEXT: subq $232, %rsp
+; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm29
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6]
; AVX512DQ-FCP-NEXT: vmovdqa 480(%rdi), %ymm1
; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm19
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm23
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX512DQ-FCP-NEXT: vmovdqa 448(%rdi), %ymm2
; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm4
-; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm4, %ymm2
+; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm5
+; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm5, %ymm2
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-FCP-NEXT: vmovdqa 416(%rdi), %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm4
+; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm22
; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-FCP-NEXT: vmovdqa 416(%rdi), %ymm3
+; AVX512DQ-FCP-NEXT: vmovdqa 384(%rdi), %ymm3
; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm0, %ymm12
-; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm12, %ymm3
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-FCP-NEXT: vmovdqa 384(%rdi), %ymm5
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm13
-; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm13, %ymm5
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
-; AVX512DQ-FCP-NEXT: vmovdqa 368(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm5
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm30
-; AVX512DQ-FCP-NEXT: vmovdqa 352(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm6
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm25
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
-; AVX512DQ-FCP-NEXT: vmovdqa 336(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm7
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm24
-; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdi), %xmm15
-; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm15, %xmm8
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3]
-; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm28
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm28, %xmm7
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm16
+; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm0, %ymm7
+; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm7, %ymm3
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm7, %ymm19
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa 352(%rdi), %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpmovqd %ymm2, %xmm4
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm17
+; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdi), %ymm3
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpmovqd %ymm3, %xmm12
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm12, %xmm3
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm12, %xmm21
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3]
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm29, %xmm3
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm12
; AVX512DQ-FCP-NEXT: movb $-64, %al
; AVX512DQ-FCP-NEXT: kmovw %eax, %k1
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16 {%k1}
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm12 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm1
; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm2
-; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm23
-; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm2
-; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm6
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm27
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm1, %ymm7
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm18
-; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm8
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm17
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm6[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm10
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm10, %xmm7
-; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm0
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm3
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm20
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3]
-; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm31
-; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm5
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm22
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3]
-; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm29
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm29, %xmm3
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm0
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm11
-; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm11, %ymm0
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm26
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm12, %ymm19
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm13, %ymm3
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm13, %ymm21
+; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm1
; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm16
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm30, %xmm13
-; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm13, %xmm3
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm25, %xmm14
-; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm14, %xmm5
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm9
-; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm9, %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm15, %xmm12
-; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm15, %xmm15
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $8, %zmm28, %zmm3
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm3, %xmm3
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm8
-; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm8, %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm4
-; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm3
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm7
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm6
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm15
-; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm6, %ymm15
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3,4],ymm3[5],ymm15[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm3
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm10, %xmm16
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm10
-; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm1
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm15
-; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm15, %xmm3
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm15
-; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm15, %xmm5
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $8, %zmm29, %zmm3
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm3, %xmm3
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm11, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm11, %ymm20
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm23
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm11
-; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm11, %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm24
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm1
-; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm25
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm2
-; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm26
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm13, %xmm2
-; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm14, %xmm3
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm5
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm9, %xmm17
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm12, %xmm15
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm28, %zmm5
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm5, %xmm5
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm8, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, %ymm9
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm5
-; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm5
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm4
-; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm7, %ymm5
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm25
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm4
-; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm15
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm26
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4],ymm5[5],ymm15[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, %xmm6
-; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm6, %xmm5
-; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm10, %xmm16
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm7
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm5
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm8
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm3
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm29, %zmm3
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm3, %xmm3
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm18
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm11, %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, %ymm11
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm1
-; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm2
-; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm13, %xmm2
-; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm14, %xmm3
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm15
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm15, %xmm5
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm12, %xmm12
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $24, %zmm28, %zmm5
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm24
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm5, %xmm5
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm9, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm5
-; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm5
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm5
-; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm5, %ymm5
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm9
-; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm4
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm6, %xmm4
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm6, %xmm21
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, %xmm5
-; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm1
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm4
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm16
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm3
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm6
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $24, %zmm29, %zmm3
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm3, %xmm3
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7]
-; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm17 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm1
-; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm28
-; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm18 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm2
-; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm31
+; AVX512DQ-FCP-NEXT: vpermd %ymm31, %ymm0, %ymm3
+; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm3, %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm20
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm9
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm19
-; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm10
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm13, %xmm9
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm30, %xmm20
-; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm14, %xmm10
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm14, %xmm22
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm12 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm15, %xmm10
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm15, %xmm23
-; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm13
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm24, %zmm10
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm25
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm10, %xmm10
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm9, %zmm0, %zmm13
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm13 {%k1}
-; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm1
-; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm27
-; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm2
-; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm15
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm26
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm31
-; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm14
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm30
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4],ymm1[5],ymm14[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm15[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm21, %xmm10
-; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm10, %xmm14
-; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm5, %xmm11
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm5, %xmm24
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, %xmm8
-; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm8, %xmm14
-; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm6, %xmm12
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm6, %xmm21
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm29, %zmm12
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm12, %xmm12
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm0
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm11
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm16
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm11
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm18 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm7
-; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm12
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 160(%rdi), %ymm25
+; AVX512DQ-FCP-NEXT: vpermd %ymm25, %ymm0, %ymm10
+; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm26
+; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm10, %ymm14
+; AVX512DQ-FCP-NEXT: vpermd %ymm26, %ymm0, %ymm9
+; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm9, %ymm0
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 96(%rdi), %ymm27
+; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm28
+; AVX512DQ-FCP-NEXT: vpmovqd %ymm27, %xmm8
+; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm8, %xmm15
+; AVX512DQ-FCP-NEXT: vpmovqd %ymm28, %xmm1
+; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm7
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm18
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm15[3]
+; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm30
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm30, %xmm15
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm15[0,1],xmm7[2,3]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm14[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm6
+; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm6, %ymm7
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm5, %ymm12
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm23
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm14, %ymm24
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5,6],ymm7[7]
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm3
+; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm12
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm14, %ymm22
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm5
+; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm5, %ymm14
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5],ymm7[6,7]
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm2
; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm12
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm19
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm4
-; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm4, %xmm13
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm23, %xmm9
-; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm9, %xmm14
-; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm15
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm21, %xmm4
+; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm4, %xmm14
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $40, %zmm25, %zmm14
+; AVX512DQ-FCP-NEXT: vpsrlq $8, %zmm29, %zmm14
; AVX512DQ-FCP-NEXT: vpmovqb %zmm14, %xmm14
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm12 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm1
-; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm2
-; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm14
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm3
-; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm14
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm3
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm2
-; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm15
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm12 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm7
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm14
+; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm1, %ymm14
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5,6],ymm7[7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm14
+; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm10, %ymm14
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm10, %ymm16
+; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm9, %ymm15
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm9, %ymm17
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm10, %xmm14
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm10, %xmm20
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm2
-; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm11
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm8, %xmm14
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm8, %xmm22
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm21, %xmm2
-; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm13
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1,2],xmm11[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $40, %zmm29, %zmm13
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm18
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm13, %xmm13
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm29
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm10
-; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm10, %ymm1
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm21 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm5
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm6
-; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm11
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm11
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm17
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5],ymm7[6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm14
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm8, %xmm11
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm18, %xmm10
+; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm10, %xmm15
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3]
+; AVX512DQ-FCP-NEXT: vpsrlq $8, %zmm30, %zmm15
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm15, %xmm15
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm6, %ymm7
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm18
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm6
+; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm6, %ymm12
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5,6],ymm7[7]
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm12
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm22
; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm12
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm0
-; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm12
-; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm4, %xmm13
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm23
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm9, %xmm14
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm9, %xmm28
-; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm15
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm7, %xmm16
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm5, %ymm14
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5],ymm7[6,7]
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm12
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm20
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm14
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm25, %zmm14
+; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm29, %zmm14
; AVX512DQ-FCP-NEXT: vpmovqb %zmm14, %xmm14
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm12 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm9
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm3
-; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm14
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm3
-; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm14
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm3
-; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm15
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm12 {%k1}
+; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm0, %ymm7
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm13
+; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm14
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm23
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5,6],ymm7[7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm2
+; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm14
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm1
+; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm15
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm8
-; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm14
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm15
-; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm15, %xmm11
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm3
-; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm3, %xmm14
-; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm13
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm20
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1,2],xmm11[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm18, %zmm13
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm13, %xmm13
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm21
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm10, %ymm6
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5],ymm7[6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm11, %xmm14
+; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm10, %xmm15
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3]
+; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm30, %zmm15
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm15, %xmm15
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm21
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm3
+; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm7
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm12 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm9
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5,6],ymm7[7]
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm3
+; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm8
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm3
; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm5, %ymm5
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm2
-; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm3
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm4
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm2
-; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm5
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm23, %xmm2
-; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm6
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm28, %xmm2
-; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm11
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, %xmm2
-; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm12
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm11[0,1,2],xmm5[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $56, %zmm25, %zmm11
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm11, %xmm11
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm11[0,1],xmm5[2,3]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm5, %zmm0, %zmm5
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm5 {%k1}
-; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm9, %ymm9
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5,6],ymm3[7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm8, %xmm2
-; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm15, %xmm3
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm1
-; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm3
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm1
-; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm4
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm6
+; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm6, %ymm9
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm6
+; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm6, %xmm8
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm6
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm8[3]
+; AVX512DQ-FCP-NEXT: vpsrlq $24, %zmm29, %zmm8
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm8, %xmm8
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3]
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm6
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm6 {%k1}
+; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm13, %ymm5
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm4
+; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm4, %ymm4
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm3
+; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm1, %ymm2
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm11, %xmm1
+; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm10, %xmm0
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
+; AVX512DQ-FCP-NEXT: vpsrlq $24, %zmm30, %zmm1
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm1, %xmm1
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm18
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7]
+; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm1
+; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm10, %ymm2
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-FCP-NEXT: vpermd (%rsp), %ymm0, %ymm3 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm19
+; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm12 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm3
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm22
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [1,3,5,7,5,7,6,7]
+; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm8 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm15 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm8, %xmm2
+; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm7 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm7, %xmm3
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $56, %zmm18, %zmm3
+; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm29, %zmm3
; AVX512DQ-FCP-NEXT: vpmovqb %zmm3, %xmm3
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
-; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovaps %zmm1, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovaps %zmm1, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovaps %zmm1, (%rcx)
-; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovaps %zmm1, (%r8)
-; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovaps %zmm1, (%r9)
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm17
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm17 {%k1}
+; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm23
+; AVX512DQ-FCP-NEXT: vpermd %ymm31, %ymm0, %ymm4
+; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm2
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-FCP-NEXT: vpermd %ymm25, %ymm0, %ymm3
+; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm24
+; AVX512DQ-FCP-NEXT: vpermd %ymm26, %ymm0, %ymm6
+; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm0
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX512DQ-FCP-NEXT: vpermd %ymm27, %ymm16, %ymm13
+; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm13, %xmm0
+; AVX512DQ-FCP-NEXT: vpermd %ymm28, %ymm16, %ymm12
+; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm15
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3]
+; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm30, %zmm15
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm15, %xmm15
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm16
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm9, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm5
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm10, %ymm25
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm20 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm10
+; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm2
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm17 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm15
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm1
+; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm15
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4],ymm2[5],ymm15[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm19 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm2
+; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm8, %xmm2
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm7, %xmm15
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm15[0,1,2],xmm2[3]
+; AVX512DQ-FCP-NEXT: vpsrlq $40, %zmm29, %zmm15
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm15, %xmm15
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,3]
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm3
+; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm15
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm22
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm11
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm4
+; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm15
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm11
+; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm6, %ymm14
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm23
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm11
+; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm13, %xmm14
+; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm12, %xmm15
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3]
+; AVX512DQ-FCP-NEXT: vpsrlq $40, %zmm30, %zmm15
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm15, %xmm15
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm19
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm5, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm26
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm5
+; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm5, %ymm2
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm10, %ymm24
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm17 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm14
+; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm1, %ymm14
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm25
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4],ymm2[5],ymm14[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm20 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm2
+; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm8, %xmm2
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm7, %xmm14
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3]
+; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm29, %zmm14
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm14, %xmm14
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3]
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1}
+; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm3, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm6
+; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm6, %ymm14
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6],ymm0[7]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm14
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm9
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm1
+; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm15
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm9
+; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm13, %xmm14
+; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm12, %xmm15
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3]
+; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm30, %zmm15
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm15, %xmm15
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm17
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm0, %ymm2
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm11
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3,4,5,6],ymm2[7]
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm5
+; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm10
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm5
+; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm5, %ymm9
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm2[6,7]
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm8, %xmm8
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm7, %xmm7
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3]
+; AVX512DQ-FCP-NEXT: vpsrlq $56, %zmm29, %zmm8
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm8, %xmm8
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm0, %zmm7
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm7 {%k1}
+; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm2
+; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm6, %ymm6
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm5
+; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm4
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm13, %xmm3
+; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm12, %xmm1
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3]
+; AVX512DQ-FCP-NEXT: vpsrlq $56, %zmm30, %zmm3
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm3, %xmm3
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1
+; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%rsi)
+; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, (%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, (%r8)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, (%r9)
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, (%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, (%rax)
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, (%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, (%rax)
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
-; AVX512DQ-FCP-NEXT: addq $440, %rsp # imm = 0x1B8
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%rax)
+; AVX512DQ-FCP-NEXT: addq $232, %rsp
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
@@ -17073,429 +16072,357 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX512BW-FCP-LABEL: load_i8_stride8_vf64:
; AVX512BW-FCP: # %bb.0:
-; AVX512BW-FCP-NEXT: subq $328, %rsp # imm = 0x148
-; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm18
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512BW-FCP-NEXT: subq $264, %rsp # imm = 0x108
+; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm26
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6]
; AVX512BW-FCP-NEXT: vmovdqa 480(%rdi), %ymm1
; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm30
-; AVX512BW-FCP-NEXT: vpshufb %ymm11, %ymm30, %ymm1
+; AVX512BW-FCP-NEXT: vpshufb %ymm5, %ymm30, %ymm1
; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX512BW-FCP-NEXT: vmovdqa 448(%rdi), %ymm2
-; AVX512BW-FCP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill
-; AVX512BW-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm31
-; AVX512BW-FCP-NEXT: vpshufb %ymm8, %ymm31, %ymm2
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512BW-FCP-NEXT: vmovdqa 416(%rdi), %ymm2
; AVX512BW-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm19
-; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm19, %ymm2
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %ymm29
-; AVX512BW-FCP-NEXT: vpermd %ymm29, %ymm0, %ymm14
-; AVX512BW-FCP-NEXT: vpshufb %ymm10, %ymm14, %ymm3
+; AVX512BW-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm12
+; AVX512BW-FCP-NEXT: vpshufb %ymm8, %ymm12, %ymm2
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512BW-FCP-NEXT: vmovdqa64 416(%rdi), %ymm22
+; AVX512BW-FCP-NEXT: vpermd %ymm22, %ymm0, %ymm20
+; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm20, %ymm2
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %ymm27
+; AVX512BW-FCP-NEXT: vpermd %ymm27, %ymm0, %ymm9
+; AVX512BW-FCP-NEXT: vpshufb %ymm14, %ymm9, %ymm3
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm7 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
-; AVX512BW-FCP-NEXT: vmovdqa64 368(%rdi), %xmm21
-; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm21, %xmm2
-; AVX512BW-FCP-NEXT: vmovdqa 352(%rdi), %xmm4
-; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm3
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm23 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
-; AVX512BW-FCP-NEXT: vmovdqa 336(%rdi), %xmm12
-; AVX512BW-FCP-NEXT: vpshufb %xmm23, %xmm12, %xmm5
-; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %xmm28
-; AVX512BW-FCP-NEXT: vpshufb %xmm23, %xmm28, %xmm6
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3]
-; AVX512BW-FCP-NEXT: vpmovqb %zmm18, %xmm5
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
-; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm20
+; AVX512BW-FCP-NEXT: vmovdqa64 352(%rdi), %ymm24
+; AVX512BW-FCP-NEXT: vpmovqd %ymm24, %xmm18
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm18, %xmm2
+; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %ymm23
+; AVX512BW-FCP-NEXT: vpmovqd %ymm23, %xmm17
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm17, %xmm3
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3]
+; AVX512BW-FCP-NEXT: vpmovqb %zmm26, %xmm3
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
+; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm19
; AVX512BW-FCP-NEXT: movb $-64, %al
; AVX512BW-FCP-NEXT: kmovd %eax, %k1
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 {%k1}
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm19 {%k1}
; AVX512BW-FCP-NEXT: vmovdqa 224(%rdi), %ymm1
; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm17
-; AVX512BW-FCP-NEXT: vpshufb %ymm11, %ymm17, %ymm1
+; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm16
+; AVX512BW-FCP-NEXT: vpshufb %ymm5, %ymm16, %ymm1
; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm2
; AVX512BW-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm3
-; AVX512BW-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm5
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm1[7]
-; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm1
-; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm2
-; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm6
-; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %ymm27
-; AVX512BW-FCP-NEXT: vpermd %ymm27, %ymm0, %ymm16
-; AVX512BW-FCP-NEXT: vpshufb %ymm10, %ymm16, %ymm8
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5],ymm8[6,7]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm5[6,7]
-; AVX512BW-FCP-NEXT: vmovdqa64 112(%rdi), %xmm26
-; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm26, %xmm8
-; AVX512BW-FCP-NEXT: vmovdqa64 96(%rdi), %xmm24
-; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm24, %xmm7
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512BW-FCP-NEXT: vmovdqa64 80(%rdi), %xmm22
-; AVX512BW-FCP-NEXT: vpshufb %xmm23, %xmm22, %xmm25
-; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm8
-; AVX512BW-FCP-NEXT: vpshufb %xmm23, %xmm8, %xmm23
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm23[0],xmm25[0],xmm23[1],xmm25[1],xmm23[2],xmm25[2],xmm23[3],xmm25[3]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0,1,2],xmm10[3]
-; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm10
-; AVX512BW-FCP-NEXT: vpmovqb %zmm10, %xmm13
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3]
+; AVX512BW-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm4
+; AVX512BW-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm2
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX512BW-FCP-NEXT: vmovdqa64 160(%rdi), %ymm31
+; AVX512BW-FCP-NEXT: vpermd %ymm31, %ymm0, %ymm10
+; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm10, %ymm11
+; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %ymm21
+; AVX512BW-FCP-NEXT: vpermd %ymm21, %ymm0, %ymm7
+; AVX512BW-FCP-NEXT: vpshufb %ymm14, %ymm7, %ymm0
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5],ymm0[6,7]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6,7]
+; AVX512BW-FCP-NEXT: vmovdqa64 96(%rdi), %ymm28
+; AVX512BW-FCP-NEXT: vpmovqd %ymm28, %xmm5
+; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm8
+; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm25
+; AVX512BW-FCP-NEXT: vpmovqd %ymm25, %xmm11
+; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm11, %xmm13
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1,2],xmm8[3]
+; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm8
+; AVX512BW-FCP-NEXT: vpmovqb %zmm8, %xmm15
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3]
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0
+; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512BW-FCP-NEXT: vpshufb %ymm11, %ymm30, %ymm0
; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm31, %ymm13
+; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm30, %ymm0
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm19 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm12, %ymm13
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5,6],ymm0[7]
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm19, %ymm13
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm14, %ymm15
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm29 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512BW-FCP-NEXT: vpshufb %ymm29, %ymm20, %ymm13
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm15
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5],ymm15[6,7]
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX512BW-FCP-NEXT: vmovdqa64 %xmm21, %xmm5
-; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm21, %xmm15
-; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm4, %xmm23
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm23[0],xmm15[0],xmm23[1],xmm15[1],xmm23[2],xmm15[2],xmm23[3],xmm15[3]
-; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm23 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX512BW-FCP-NEXT: vmovdqa %xmm12, %xmm7
-; AVX512BW-FCP-NEXT: vpshufb %xmm23, %xmm12, %xmm25
-; AVX512BW-FCP-NEXT: vpshufb %xmm23, %xmm28, %xmm20
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm20[0],xmm25[0],xmm20[1],xmm25[1],xmm20[2],xmm25[2],xmm20[3],xmm25[3]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1,2],xmm15[3]
-; AVX512BW-FCP-NEXT: vpsrlq $8, %zmm18, %zmm15
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm18, %xmm13
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512BW-FCP-NEXT: vpshufb %xmm1, %xmm17, %xmm15
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3]
+; AVX512BW-FCP-NEXT: vpsrlq $8, %zmm26, %zmm15
; AVX512BW-FCP-NEXT: vpmovqb %zmm15, %xmm15
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1],xmm12[2,3]
-; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm12 {%k1}
-; AVX512BW-FCP-NEXT: vpshufb %ymm11, %ymm17, %ymm0
-; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm15
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3]
+; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm13, %zmm0, %zmm13
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm13 {%k1}
+; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm16, %ymm0
+; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm4, %ymm15
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7]
-; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm15
-; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm16, %ymm11
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm15[5],ymm11[6,7]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm26, %xmm11
-; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm24, %xmm13
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3]
-; AVX512BW-FCP-NEXT: vpshufb %xmm23, %xmm22, %xmm13
-; AVX512BW-FCP-NEXT: vpshufb %xmm23, %xmm8, %xmm15
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1,2],xmm11[3]
-; AVX512BW-FCP-NEXT: vpsrlq $8, %zmm10, %zmm13
-; AVX512BW-FCP-NEXT: vpmovqb %zmm13, %xmm13
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0
+; AVX512BW-FCP-NEXT: vpshufb %ymm29, %ymm10, %ymm15
+; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm7, %ymm14
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7]
+; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm14
+; AVX512BW-FCP-NEXT: vpshufb %xmm1, %xmm11, %xmm15
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3]
+; AVX512BW-FCP-NEXT: vpsrlq $8, %zmm8, %zmm15
+; AVX512BW-FCP-NEXT: vpmovqb %zmm15, %xmm15
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm21 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512BW-FCP-NEXT: vpshufb %ymm21, %ymm30, %ymm0
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm25 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512BW-FCP-NEXT: vpshufb %ymm25, %ymm31, %ymm11
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5,6],ymm0[7]
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm19, %ymm11
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm14, %ymm12
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm5, %xmm12
-; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm4, %xmm13
-; AVX512BW-FCP-NEXT: vmovdqa %xmm4, %xmm1
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm15
-; AVX512BW-FCP-NEXT: vmovdqa64 %xmm7, %xmm23
-; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm28, %xmm20
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm20[0],xmm15[0],xmm20[1],xmm15[1],xmm20[2],xmm15[2],xmm20[3],xmm15[3]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1,2],xmm12[3]
-; AVX512BW-FCP-NEXT: vpsrlq $16, %zmm18, %zmm15
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm30, %ymm0
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm12, %ymm13
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5,6],ymm0[7]
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm20, %ymm13
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm19 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm9, %ymm14
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5],ymm14[6,7]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7]
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm29 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512BW-FCP-NEXT: vpshufb %xmm29, %xmm18, %xmm13
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm17, %xmm15
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3]
+; AVX512BW-FCP-NEXT: vpsrlq $16, %zmm26, %zmm15
; AVX512BW-FCP-NEXT: vpmovqb %zmm15, %xmm15
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1],xmm12[2,3]
-; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm12 {%k1}
-; AVX512BW-FCP-NEXT: vpshufb %ymm21, %ymm17, %ymm0
-; AVX512BW-FCP-NEXT: vpshufb %ymm25, %ymm3, %ymm15
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3]
+; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm13, %zmm0, %zmm13
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm13 {%k1}
+; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm16, %ymm0
+; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm15
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7]
-; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm15
-; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm16, %ymm9
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm15[5],ymm9[6,7]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm26, %xmm9
-; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm24, %xmm11
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
-; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm22, %xmm11
-; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm8, %xmm13
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3]
-; AVX512BW-FCP-NEXT: vpsrlq $16, %zmm10, %zmm11
-; AVX512BW-FCP-NEXT: vpmovqb %zmm11, %xmm11
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0
+; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm10, %ymm15
+; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm7, %ymm14
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7]
+; AVX512BW-FCP-NEXT: vpshufb %xmm29, %xmm5, %xmm14
+; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm11, %xmm15
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3]
+; AVX512BW-FCP-NEXT: vpsrlq $16, %zmm8, %zmm15
+; AVX512BW-FCP-NEXT: vpmovqb %zmm15, %xmm15
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm30, %ymm0
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm31, %ymm9
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5,6],ymm0[7]
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm31 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512BW-FCP-NEXT: vpshufb %ymm31, %ymm19, %ymm9
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm30, %ymm0
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512BW-FCP-NEXT: vpshufb %ymm13, %ymm12, %ymm12
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6],ymm0[7]
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm20, %ymm12
; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm30 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512BW-FCP-NEXT: vpshufb %ymm30, %ymm14, %ymm11
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5],ymm11[6,7]
+; AVX512BW-FCP-NEXT: vpshufb %ymm30, %ymm9, %ymm9
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm12[5],ymm9[6,7]
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512BW-FCP-NEXT: vmovdqa64 %xmm5, %xmm19
-; AVX512BW-FCP-NEXT: vpshufb %xmm9, %xmm5, %xmm11
-; AVX512BW-FCP-NEXT: vmovdqa64 %xmm1, %xmm20
-; AVX512BW-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm12
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
-; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm12 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm7, %xmm14
-; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm28, %xmm15
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm14[0,1,2],xmm11[3]
-; AVX512BW-FCP-NEXT: vpsrlq $24, %zmm18, %zmm14
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm25
-; AVX512BW-FCP-NEXT: vpmovqb %zmm14, %xmm14
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm14[0,1],xmm11[2,3]
-; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm0, %zmm11
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm11 {%k1}
-; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm17, %ymm0
-; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7]
-; AVX512BW-FCP-NEXT: vpshufb %ymm31, %ymm2, %ymm2
-; AVX512BW-FCP-NEXT: vpshufb %ymm30, %ymm16, %ymm1
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FCP-NEXT: vpshufb %xmm9, %xmm26, %xmm1
-; AVX512BW-FCP-NEXT: vpshufb %xmm9, %xmm24, %xmm2
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm22, %xmm2
-; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm8, %xmm3
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm29 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512BW-FCP-NEXT: vpshufb %xmm29, %xmm18, %xmm12
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm19 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512BW-FCP-NEXT: vpshufb %xmm19, %xmm17, %xmm6
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm12[3]
+; AVX512BW-FCP-NEXT: vpsrlq $24, %zmm26, %zmm12
+; AVX512BW-FCP-NEXT: vpmovqb %zmm12, %xmm12
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm12[0,1],xmm6[2,3]
+; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm6
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm6 {%k1}
+; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm16, %ymm0
+; AVX512BW-FCP-NEXT: vpshufb %ymm13, %ymm4, %ymm4
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6],ymm0[7]
+; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm3
+; AVX512BW-FCP-NEXT: vpshufb %ymm30, %ymm7, %ymm2
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
+; AVX512BW-FCP-NEXT: vpshufb %xmm29, %xmm5, %xmm1
+; AVX512BW-FCP-NEXT: vpshufb %xmm19, %xmm11, %xmm2
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
-; AVX512BW-FCP-NEXT: vpsrlq $24, %zmm10, %zmm2
+; AVX512BW-FCP-NEXT: vpsrlq $24, %zmm8, %zmm2
; AVX512BW-FCP-NEXT: vpmovqb %zmm2, %xmm2
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,3,2,3,1,3,5,7]
-; AVX512BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload
-; AVX512BW-FCP-NEXT: vpermd (%rsp), %ymm3, %ymm7 # 32-byte Folded Reload
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512BW-FCP-NEXT: vpshufb %ymm13, %ymm6, %ymm2
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm7, %ymm9
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm2[7]
-; AVX512BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload
-; AVX512BW-FCP-NEXT: vpermd %ymm29, %ymm3, %ymm14
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm0 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512BW-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm11
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm14, %ymm12
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7]
-; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm19, %xmm12
-; AVX512BW-FCP-NEXT: vmovdqa64 %xmm19, %xmm16
-; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm20, %xmm15
-; AVX512BW-FCP-NEXT: vmovdqa64 %xmm20, %xmm17
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3]
-; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm20 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX512BW-FCP-NEXT: vpshufb %xmm20, %xmm23, %xmm15
-; AVX512BW-FCP-NEXT: vpshufb %xmm20, %xmm28, %xmm29
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm29[0],xmm15[0],xmm29[1],xmm15[1],xmm29[2],xmm15[2],xmm29[3],xmm15[3]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1,2],xmm12[3]
-; AVX512BW-FCP-NEXT: vpsrlq $32, %zmm18, %zmm15
-; AVX512BW-FCP-NEXT: vpmovqb %zmm15, %xmm15
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1],xmm12[2,3]
+; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,3,2,3,1,3,5,7]
+; AVX512BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload
+; AVX512BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm2
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512BW-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm3
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
+; AVX512BW-FCP-NEXT: vpermd %ymm22, %ymm6, %ymm2
+; AVX512BW-FCP-NEXT: vpermd %ymm27, %ymm6, %ymm3
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm22 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512BW-FCP-NEXT: vpshufb %ymm22, %ymm2, %ymm5
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm27 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512BW-FCP-NEXT: vpshufb %ymm27, %ymm3, %ymm11
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4],ymm5[5],ymm11[6,7]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1,2,3,4,5],ymm4[6,7]
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [1,3,5,7,5,7,6,7]
+; AVX512BW-FCP-NEXT: vpermd %ymm24, %ymm12, %ymm4
+; AVX512BW-FCP-NEXT: vpermd %ymm23, %ymm12, %ymm5
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm23 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512BW-FCP-NEXT: vpshufb %xmm23, %xmm4, %xmm13
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm24 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512BW-FCP-NEXT: vpshufb %xmm24, %xmm5, %xmm14
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3]
+; AVX512BW-FCP-NEXT: vpsrlq $32, %zmm26, %zmm14
+; AVX512BW-FCP-NEXT: vpmovqb %zmm14, %xmm14
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3]
+; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm13, %zmm0, %zmm14
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm14 {%k1}
+; AVX512BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm11 # 32-byte Folded Reload
+; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm11, %ymm13
+; AVX512BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm17 # 32-byte Folded Reload
+; AVX512BW-FCP-NEXT: vpshufb %ymm10, %ymm17, %ymm15
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7]
+; AVX512BW-FCP-NEXT: vpermd %ymm31, %ymm6, %ymm16
+; AVX512BW-FCP-NEXT: vpermd %ymm21, %ymm6, %ymm15
+; AVX512BW-FCP-NEXT: vpshufb %ymm22, %ymm16, %ymm6
+; AVX512BW-FCP-NEXT: vpshufb %ymm27, %ymm15, %ymm10
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4],ymm6[5],ymm10[6,7]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm13[6,7]
+; AVX512BW-FCP-NEXT: vpermd %ymm28, %ymm12, %ymm13
+; AVX512BW-FCP-NEXT: vpermd %ymm25, %ymm12, %ymm18
+; AVX512BW-FCP-NEXT: vpshufb %xmm23, %xmm13, %xmm10
+; AVX512BW-FCP-NEXT: vpshufb %xmm24, %xmm18, %xmm12
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3]
+; AVX512BW-FCP-NEXT: vpsrlq $32, %zmm8, %zmm12
+; AVX512BW-FCP-NEXT: vpmovqb %zmm12, %xmm12
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm14, %zmm6
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm10
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm12
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5,6],ymm10[7]
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm23 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512BW-FCP-NEXT: vpshufb %ymm23, %ymm2, %ymm12
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm20 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512BW-FCP-NEXT: vpshufb %ymm20, %ymm3, %ymm14
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5],ymm10[6,7]
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm21 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512BW-FCP-NEXT: vpshufb %xmm21, %xmm4, %xmm12
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm22 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512BW-FCP-NEXT: vpshufb %xmm22, %xmm5, %xmm14
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3]
+; AVX512BW-FCP-NEXT: vpsrlq $40, %zmm26, %zmm14
+; AVX512BW-FCP-NEXT: vpmovqb %zmm14, %xmm14
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm12 {%k1}
-; AVX512BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm19 # 32-byte Folded Reload
-; AVX512BW-FCP-NEXT: vpshufb %ymm13, %ymm19, %ymm9
-; AVX512BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm18 # 32-byte Folded Reload
-; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm18, %ymm15
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3,4,5,6],ymm9[7]
-; AVX512BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm15 # 32-byte Folded Reload
-; AVX512BW-FCP-NEXT: vpermd %ymm27, %ymm3, %ymm21
-; AVX512BW-FCP-NEXT: vpshufb %ymm0, %ymm15, %ymm3
-; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm21, %ymm13
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4],ymm3[5],ymm13[6,7]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm9[6,7]
-; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm26, %xmm9
-; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm24, %xmm11
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
-; AVX512BW-FCP-NEXT: vpshufb %xmm20, %xmm22, %xmm11
-; AVX512BW-FCP-NEXT: vpshufb %xmm20, %xmm8, %xmm13
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3]
-; AVX512BW-FCP-NEXT: vpsrlq $32, %zmm10, %zmm11
-; AVX512BW-FCP-NEXT: vpmovqb %zmm11, %xmm11
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm12, %zmm29
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm9
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm0 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512BW-FCP-NEXT: vpshufb %ymm0, %ymm7, %ymm11
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7]
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm11
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm14, %ymm12
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7]
-; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm16, %xmm12
-; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm17, %xmm13
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm23, %xmm20
-; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm28, %xmm27
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm27[0],xmm20[0],xmm27[1],xmm20[1],xmm27[2],xmm20[2],xmm27[3],xmm20[3]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm12[3]
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm27
-; AVX512BW-FCP-NEXT: vpsrlq $40, %zmm25, %zmm12
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm12 {%k1}
+; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm11, %ymm10
+; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm17, %ymm14
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3,4,5,6],ymm10[7]
+; AVX512BW-FCP-NEXT: vpshufb %ymm23, %ymm16, %ymm14
+; AVX512BW-FCP-NEXT: vpshufb %ymm20, %ymm15, %ymm9
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm14[5],ymm9[6,7]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7]
+; AVX512BW-FCP-NEXT: vpshufb %xmm21, %xmm13, %xmm10
+; AVX512BW-FCP-NEXT: vpshufb %xmm22, %xmm18, %xmm14
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm14[0,1,2],xmm10[3]
+; AVX512BW-FCP-NEXT: vpsrlq $40, %zmm8, %zmm14
+; AVX512BW-FCP-NEXT: vpmovqb %zmm14, %xmm14
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
+; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm21
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512BW-FCP-NEXT: vpshufb %ymm14, %ymm0, %ymm9
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm20 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512BW-FCP-NEXT: vpshufb %ymm20, %ymm1, %ymm10
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5,6],ymm9[7]
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm22 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512BW-FCP-NEXT: vpshufb %ymm22, %ymm2, %ymm10
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm12
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm10[5],ymm12[6,7]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7]
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm24 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512BW-FCP-NEXT: vpshufb %xmm24, %xmm4, %xmm10
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm23 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512BW-FCP-NEXT: vpshufb %xmm23, %xmm5, %xmm12
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3]
+; AVX512BW-FCP-NEXT: vpsrlq $48, %zmm26, %zmm12
; AVX512BW-FCP-NEXT: vpmovqb %zmm12, %xmm12
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm12[0,1],xmm3[2,3]
-; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm3
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm3 {%k1}
-; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm19, %ymm9
-; AVX512BW-FCP-NEXT: vpshufb %ymm0, %ymm18, %ymm12
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3]
+; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm0, %zmm10
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm10 {%k1}
+; AVX512BW-FCP-NEXT: vpshufb %ymm14, %ymm11, %ymm9
+; AVX512BW-FCP-NEXT: vpshufb %ymm20, %ymm17, %ymm12
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5,6],ymm9[7]
-; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm15, %ymm12
-; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm21, %ymm4
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm12[5],ymm4[6,7]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm9[6,7]
-; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm26, %xmm9
-; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm24, %xmm11
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
-; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm22, %xmm11
-; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm8, %xmm12
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3]
-; AVX512BW-FCP-NEXT: vpsrlq $40, %zmm10, %zmm11
-; AVX512BW-FCP-NEXT: vpmovqb %zmm11, %xmm11
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm20
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512BW-FCP-NEXT: vpshufb %ymm0, %ymm6, %ymm3
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm7, %ymm4
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm4
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm25 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512BW-FCP-NEXT: vpshufb %ymm25, %ymm14, %ymm9
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4],ymm4[5],ymm9[6,7]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
-; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX512BW-FCP-NEXT: vpshufb %xmm4, %xmm16, %xmm9
-; AVX512BW-FCP-NEXT: vpshufb %xmm4, %xmm17, %xmm11
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
-; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm23, %xmm12
-; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm28, %xmm13
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0,1,2],xmm9[3]
-; AVX512BW-FCP-NEXT: vpsrlq $48, %zmm27, %zmm12
-; AVX512BW-FCP-NEXT: vpmovqb %zmm12, %xmm12
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0,1],xmm9[2,3]
-; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm9, %zmm0, %zmm9
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm9 {%k1}
-; AVX512BW-FCP-NEXT: vpshufb %ymm0, %ymm19, %ymm3
-; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm18, %ymm12
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3,4,5,6],ymm3[7]
-; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm15, %ymm12
-; AVX512BW-FCP-NEXT: vpshufb %ymm25, %ymm21, %ymm13
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4],ymm12[5],ymm13[6,7]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3,4,5],ymm3[6,7]
-; AVX512BW-FCP-NEXT: vpshufb %xmm4, %xmm26, %xmm12
-; AVX512BW-FCP-NEXT: vpshufb %xmm4, %xmm24, %xmm4
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3]
-; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm22, %xmm12
-; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm11
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm11[0,1,2],xmm4[3]
-; AVX512BW-FCP-NEXT: vpsrlq $48, %zmm10, %zmm11
-; AVX512BW-FCP-NEXT: vpmovqb %zmm11, %xmm11
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm11[0,1],xmm4[2,3]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm9, %zmm3
+; AVX512BW-FCP-NEXT: vpshufb %ymm22, %ymm16, %ymm12
+; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm15, %ymm14
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5],ymm9[6,7]
+; AVX512BW-FCP-NEXT: vpshufb %xmm24, %xmm13, %xmm12
+; AVX512BW-FCP-NEXT: vpshufb %xmm23, %xmm18, %xmm14
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3]
+; AVX512BW-FCP-NEXT: vpsrlq $48, %zmm8, %zmm14
+; AVX512BW-FCP-NEXT: vpmovqb %zmm14, %xmm14
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
+; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512BW-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm0
; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm12 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512BW-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm0
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm7, %ymm1
+; AVX512BW-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm1
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
-; AVX512BW-FCP-NEXT: vpshufb %ymm31, %ymm5, %ymm1
-; AVX512BW-FCP-NEXT: vpshufb %ymm30, %ymm14, %ymm2
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm1
+; AVX512BW-FCP-NEXT: vpshufb %ymm30, %ymm3, %ymm2
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX512BW-FCP-NEXT: vpshufb %xmm1, %xmm16, %xmm2
-; AVX512BW-FCP-NEXT: vpshufb %xmm1, %xmm17, %xmm4
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
-; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX512BW-FCP-NEXT: vpshufb %xmm4, %xmm23, %xmm9
-; AVX512BW-FCP-NEXT: vpshufb %xmm4, %xmm28, %xmm11
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1,2],xmm2[3]
-; AVX512BW-FCP-NEXT: vpsrlq $56, %zmm27, %zmm9
-; AVX512BW-FCP-NEXT: vpmovqb %zmm9, %xmm9
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3]
-; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1}
-; AVX512BW-FCP-NEXT: vpshufb %ymm12, %ymm19, %ymm0
-; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm18, %ymm9
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5,6],ymm0[7]
-; AVX512BW-FCP-NEXT: vpshufb %ymm31, %ymm15, %ymm9
-; AVX512BW-FCP-NEXT: vpshufb %ymm30, %ymm21, %ymm11
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5],ymm11[6,7]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FCP-NEXT: vpshufb %xmm1, %xmm26, %xmm5
-; AVX512BW-FCP-NEXT: vpshufb %xmm1, %xmm24, %xmm1
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
-; AVX512BW-FCP-NEXT: vpshufb %xmm4, %xmm22, %xmm5
-; AVX512BW-FCP-NEXT: vpshufb %xmm4, %xmm8, %xmm4
-; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3]
-; AVX512BW-FCP-NEXT: vpsrlq $56, %zmm10, %zmm4
-; AVX512BW-FCP-NEXT: vpmovqb %zmm4, %xmm4
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512BW-FCP-NEXT: vpshufb %xmm29, %xmm4, %xmm1
+; AVX512BW-FCP-NEXT: vpshufb %xmm19, %xmm5, %xmm2
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
+; AVX512BW-FCP-NEXT: vpsrlq $56, %zmm26, %zmm2
+; AVX512BW-FCP-NEXT: vpmovqb %zmm2, %xmm2
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1}
+; AVX512BW-FCP-NEXT: vpshufb %ymm10, %ymm11, %ymm0
+; AVX512BW-FCP-NEXT: vpshufb %ymm12, %ymm17, %ymm2
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
+; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm16, %ymm2
+; AVX512BW-FCP-NEXT: vpshufb %ymm30, %ymm15, %ymm3
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
+; AVX512BW-FCP-NEXT: vpshufb %xmm29, %xmm13, %xmm2
+; AVX512BW-FCP-NEXT: vpshufb %xmm19, %xmm18, %xmm3
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3]
+; AVX512BW-FCP-NEXT: vpsrlq $56, %zmm8, %zmm3
+; AVX512BW-FCP-NEXT: vpmovqb %zmm3, %xmm3
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovaps %zmm1, (%rsi)
; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovaps %zmm1, (%rdx)
; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovaps %zmm1, (%rcx)
-; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovups (%rsp), %zmm1 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovaps %zmm1, (%r8)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, (%r9)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, (%r9)
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, (%rax)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, (%rax)
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, (%rax)
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
-; AVX512BW-FCP-NEXT: addq $328, %rsp # imm = 0x148
+; AVX512BW-FCP-NEXT: addq $264, %rsp # imm = 0x108
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
@@ -18178,429 +17105,357 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX512DQ-BW-FCP-LABEL: load_i8_stride8_vf64:
; AVX512DQ-BW-FCP: # %bb.0:
-; AVX512DQ-BW-FCP-NEXT: subq $328, %rsp # imm = 0x148
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm18
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-BW-FCP-NEXT: subq $264, %rsp # imm = 0x108
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm26
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6]
; AVX512DQ-BW-FCP-NEXT: vmovdqa 480(%rdi), %ymm1
; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm30
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm11, %ymm30, %ymm1
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm5, %ymm30, %ymm1
; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX512DQ-BW-FCP-NEXT: vmovdqa 448(%rdi), %ymm2
-; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm31
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm8, %ymm31, %ymm2
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 416(%rdi), %ymm2
; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm19
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm19, %ymm2
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %ymm29
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm29, %ymm0, %ymm14
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm10, %ymm14, %ymm3
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm12
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm8, %ymm12, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 416(%rdi), %ymm22
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm22, %ymm0, %ymm20
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm20, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %ymm27
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm27, %ymm0, %ymm9
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm14, %ymm9, %ymm3
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm7 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 368(%rdi), %xmm21
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm21, %xmm2
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 352(%rdi), %xmm4
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm3
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm23 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 336(%rdi), %xmm12
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm23, %xmm12, %xmm5
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %xmm28
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm23, %xmm28, %xmm6
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3]
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm18, %xmm5
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm20
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 352(%rdi), %ymm24
+; AVX512DQ-BW-FCP-NEXT: vpmovqd %ymm24, %xmm18
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm18, %xmm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %ymm23
+; AVX512DQ-BW-FCP-NEXT: vpmovqd %ymm23, %xmm17
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm17, %xmm3
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3]
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm26, %xmm3
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm19
; AVX512DQ-BW-FCP-NEXT: movb $-64, %al
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm19 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqa 224(%rdi), %ymm1
; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm17
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm11, %ymm17, %ymm1
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm16
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm5, %ymm16, %ymm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm2
; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm3
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm5
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm2
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm6
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %ymm27
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm27, %ymm0, %ymm16
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm10, %ymm16, %ymm8
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5],ymm8[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm5[6,7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 112(%rdi), %xmm26
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm26, %xmm8
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 96(%rdi), %xmm24
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm24, %xmm7
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 80(%rdi), %xmm22
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm23, %xmm22, %xmm25
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm8
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm23, %xmm8, %xmm23
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm23[0],xmm25[0],xmm23[1],xmm25[1],xmm23[2],xmm25[2],xmm23[3],xmm25[3]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0,1,2],xmm10[3]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm10
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm10, %xmm13
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3]
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm4
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 160(%rdi), %ymm31
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm31, %ymm0, %ymm10
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm10, %ymm11
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %ymm21
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm21, %ymm0, %ymm7
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm14, %ymm7, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5],ymm0[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6,7]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 96(%rdi), %ymm28
+; AVX512DQ-BW-FCP-NEXT: vpmovqd %ymm28, %xmm5
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm8
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm25
+; AVX512DQ-BW-FCP-NEXT: vpmovqd %ymm25, %xmm11
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm11, %xmm13
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1,2],xmm8[3]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm8
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm8, %xmm15
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3]
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm11, %ymm30, %ymm0
; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm31, %ymm13
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm30, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm19 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm12, %ymm13
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm19, %ymm13
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm14, %ymm15
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm29 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm29, %ymm20, %ymm13
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm15
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5],ymm15[6,7]
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %xmm21, %xmm5
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm21, %xmm15
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm4, %xmm23
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm23[0],xmm15[0],xmm23[1],xmm15[1],xmm23[2],xmm15[2],xmm23[3],xmm15[3]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm23 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm12, %xmm7
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm23, %xmm12, %xmm25
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm23, %xmm28, %xmm20
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm20[0],xmm25[0],xmm20[1],xmm25[1],xmm20[2],xmm25[2],xmm20[3],xmm25[3]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1,2],xmm15[3]
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $8, %zmm18, %zmm15
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm18, %xmm13
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm1, %xmm17, %xmm15
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3]
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $8, %zmm26, %zmm15
; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm15, %xmm15
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1],xmm12[2,3]
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm12 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm11, %ymm17, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm15
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3]
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm13, %zmm0, %zmm13
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm13 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm16, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm4, %ymm15
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm15
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm16, %ymm11
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm15[5],ymm11[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm26, %xmm11
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm24, %xmm13
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm23, %xmm22, %xmm13
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm23, %xmm8, %xmm15
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1,2],xmm11[3]
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $8, %zmm10, %zmm13
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm13, %xmm13
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm29, %ymm10, %ymm15
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm7, %ymm14
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm14
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm1, %xmm11, %xmm15
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3]
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $8, %zmm8, %zmm15
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm15, %xmm15
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm21 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm21, %ymm30, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm25 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm25, %ymm31, %ymm11
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm19, %ymm11
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm14, %ymm12
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm5, %xmm12
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm4, %xmm13
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm4, %xmm1
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm15
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %xmm7, %xmm23
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm28, %xmm20
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm20[0],xmm15[0],xmm20[1],xmm15[1],xmm20[2],xmm15[2],xmm20[3],xmm15[3]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1,2],xmm12[3]
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %zmm18, %zmm15
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm30, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm12, %ymm13
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5,6],ymm0[7]
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm20, %ymm13
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm19 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm9, %ymm14
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5],ymm14[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm29 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm29, %xmm18, %xmm13
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm17, %xmm15
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3]
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %zmm26, %zmm15
; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm15, %xmm15
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1],xmm12[2,3]
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm12 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm21, %ymm17, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm25, %ymm3, %ymm15
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3]
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm13, %zmm0, %zmm13
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm13 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm16, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm15
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm15
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm16, %ymm9
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm15[5],ymm9[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm26, %xmm9
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm24, %xmm11
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm22, %xmm11
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm8, %xmm13
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3]
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %zmm10, %zmm11
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm11, %xmm11
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm10, %ymm15
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm7, %ymm14
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm29, %xmm5, %xmm14
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm11, %xmm15
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3]
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %zmm8, %zmm15
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm15, %xmm15
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm30, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm31, %ymm9
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm31 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm31, %ymm19, %ymm9
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm30, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm13, %ymm12, %ymm12
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6],ymm0[7]
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm20, %ymm12
; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm30 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm30, %ymm14, %ymm11
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5],ymm11[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm30, %ymm9, %ymm9
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm12[5],ymm9[6,7]
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %xmm5, %xmm19
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm9, %xmm5, %xmm11
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %xmm1, %xmm20
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm12
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm12 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm7, %xmm14
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm28, %xmm15
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm14[0,1,2],xmm11[3]
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $24, %zmm18, %zmm14
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm25
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm14[0,1],xmm11[2,3]
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm0, %zmm11
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm11 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm17, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm31, %ymm2, %ymm2
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm30, %ymm16, %ymm1
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm9, %xmm26, %xmm1
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm9, %xmm24, %xmm2
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm22, %xmm2
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm8, %xmm3
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm29 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm29, %xmm18, %xmm12
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm19 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm19, %xmm17, %xmm6
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm12[3]
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $24, %zmm26, %zmm12
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm12, %xmm12
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm12[0,1],xmm6[2,3]
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm6
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm6 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm16, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm13, %ymm4, %ymm4
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6],ymm0[7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm3
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm30, %ymm7, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm29, %xmm5, %xmm1
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm19, %xmm11, %xmm2
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $24, %zmm10, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $24, %zmm8, %zmm2
; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm2, %xmm2
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,3,2,3,1,3,5,7]
-; AVX512DQ-BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT: vpermd (%rsp), %ymm3, %ymm7 # 32-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm13, %ymm6, %ymm2
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm7, %ymm9
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm2[7]
-; AVX512DQ-BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm29, %ymm3, %ymm14
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm0 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm11
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm14, %ymm12
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm19, %xmm12
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %xmm19, %xmm16
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm20, %xmm15
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %xmm20, %xmm17
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm20 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm20, %xmm23, %xmm15
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm20, %xmm28, %xmm29
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm29[0],xmm15[0],xmm29[1],xmm15[1],xmm29[2],xmm15[2],xmm29[3],xmm15[3]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1,2],xmm12[3]
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $32, %zmm18, %zmm15
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm15, %xmm15
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1],xmm12[2,3]
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,3,2,3,1,3,5,7]
+; AVX512DQ-BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm3
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm22, %ymm6, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm27, %ymm6, %ymm3
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm22 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm22, %ymm2, %ymm5
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm27 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm27, %ymm3, %ymm11
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4],ymm5[5],ymm11[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [1,3,5,7,5,7,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm24, %ymm12, %ymm4
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm23, %ymm12, %ymm5
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm23 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm23, %xmm4, %xmm13
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm24 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm24, %xmm5, %xmm14
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3]
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $32, %zmm26, %zmm14
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm14, %xmm14
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3]
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm13, %zmm0, %zmm14
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm14 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm11 # 32-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm11, %ymm13
+; AVX512DQ-BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm17 # 32-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm10, %ymm17, %ymm15
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7]
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm31, %ymm6, %ymm16
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm21, %ymm6, %ymm15
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm22, %ymm16, %ymm6
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm27, %ymm15, %ymm10
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4],ymm6[5],ymm10[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm13[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm28, %ymm12, %ymm13
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm25, %ymm12, %ymm18
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm23, %xmm13, %xmm10
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm24, %xmm18, %xmm12
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3]
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $32, %zmm8, %zmm12
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm12, %xmm12
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm14, %zmm6
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm10
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm12
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5,6],ymm10[7]
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm23 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm23, %ymm2, %ymm12
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm20 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm20, %ymm3, %ymm14
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5],ymm10[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm21 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm21, %xmm4, %xmm12
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm22 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm22, %xmm5, %xmm14
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3]
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $40, %zmm26, %zmm14
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm14, %xmm14
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm12 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm19 # 32-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm13, %ymm19, %ymm9
-; AVX512DQ-BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm18 # 32-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm18, %ymm15
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3,4,5,6],ymm9[7]
-; AVX512DQ-BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm15 # 32-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm27, %ymm3, %ymm21
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm0, %ymm15, %ymm3
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm21, %ymm13
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4],ymm3[5],ymm13[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm9[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm26, %xmm9
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm24, %xmm11
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm20, %xmm22, %xmm11
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm20, %xmm8, %xmm13
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3]
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $32, %zmm10, %zmm11
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm11, %xmm11
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm12, %zmm29
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm9
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm0 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm0, %ymm7, %ymm11
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm11
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm14, %ymm12
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm16, %xmm12
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm17, %xmm13
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm23, %xmm20
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm28, %xmm27
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm27[0],xmm20[0],xmm27[1],xmm20[1],xmm27[2],xmm20[2],xmm27[3],xmm20[3]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm12[3]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm27
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $40, %zmm25, %zmm12
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm12 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm11, %ymm10
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm17, %ymm14
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3,4,5,6],ymm10[7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm23, %ymm16, %ymm14
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm20, %ymm15, %ymm9
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm14[5],ymm9[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm21, %xmm13, %xmm10
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm22, %xmm18, %xmm14
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm14[0,1,2],xmm10[3]
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $40, %zmm8, %zmm14
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm14, %xmm14
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm21
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm14, %ymm0, %ymm9
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm20 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm20, %ymm1, %ymm10
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5,6],ymm9[7]
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm22 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm22, %ymm2, %ymm10
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm12
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm10[5],ymm12[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm24 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm24, %xmm4, %xmm10
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm23 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm23, %xmm5, %xmm12
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3]
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %zmm26, %zmm12
; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm12, %xmm12
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm12[0,1],xmm3[2,3]
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm3 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm19, %ymm9
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm0, %ymm18, %ymm12
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3]
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm0, %zmm10
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm10 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm14, %ymm11, %ymm9
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm20, %ymm17, %ymm12
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5,6],ymm9[7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm15, %ymm12
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm21, %ymm4
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm12[5],ymm4[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm9[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm26, %xmm9
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm24, %xmm11
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm22, %xmm11
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm8, %xmm12
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3]
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $40, %zmm10, %zmm11
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm11, %xmm11
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm20
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm0, %ymm6, %ymm3
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm7, %ymm4
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm4
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm25 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm25, %ymm14, %ymm9
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4],ymm4[5],ymm9[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm4, %xmm16, %xmm9
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm4, %xmm17, %xmm11
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm23, %xmm12
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm28, %xmm13
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0,1,2],xmm9[3]
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %zmm27, %zmm12
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm12, %xmm12
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0,1],xmm9[2,3]
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm9, %zmm0, %zmm9
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm9 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm0, %ymm19, %ymm3
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm18, %ymm12
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3,4,5,6],ymm3[7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm15, %ymm12
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm25, %ymm21, %ymm13
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4],ymm12[5],ymm13[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3,4,5],ymm3[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm4, %xmm26, %xmm12
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm4, %xmm24, %xmm4
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm22, %xmm12
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm11
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm11[0,1,2],xmm4[3]
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %zmm10, %zmm11
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm11, %xmm11
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm11[0,1],xmm4[2,3]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm9, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm22, %ymm16, %ymm12
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm15, %ymm14
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5],ymm9[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm24, %xmm13, %xmm12
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm23, %xmm18, %xmm14
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3]
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %zmm8, %zmm14
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm14, %xmm14
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm0
; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm12 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm7, %ymm1
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm1
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm31, %ymm5, %ymm1
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm30, %ymm14, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm1
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm30, %ymm3, %ymm2
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm1, %xmm16, %xmm2
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm1, %xmm17, %xmm4
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm4, %xmm23, %xmm9
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm4, %xmm28, %xmm11
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1,2],xmm2[3]
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $56, %zmm27, %zmm9
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm9, %xmm9
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3]
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm12, %ymm19, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm18, %ymm9
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm31, %ymm15, %ymm9
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm30, %ymm21, %ymm11
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5],ymm11[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm1, %xmm26, %xmm5
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm1, %xmm24, %xmm1
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm4, %xmm22, %xmm5
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm4, %xmm8, %xmm4
-; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3]
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $56, %zmm10, %zmm4
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm4, %xmm4
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm29, %xmm4, %xmm1
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm19, %xmm5, %xmm2
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $56, %zmm26, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm2, %xmm2
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm10, %ymm11, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm12, %ymm17, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm16, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm30, %ymm15, %ymm3
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm29, %xmm13, %xmm2
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm19, %xmm18, %xmm3
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3]
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $56, %zmm8, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm3, %xmm3
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, (%rsi)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, (%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, (%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovups (%rsp), %zmm1 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, (%r8)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, (%r9)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, (%r9)
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, (%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, (%rax)
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, (%rax)
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
-; AVX512DQ-BW-FCP-NEXT: addq $328, %rsp # imm = 0x148
+; AVX512DQ-BW-FCP-NEXT: addq $264, %rsp # imm = 0x108
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%wide.vec = load <512 x i8>, ptr %in.vec, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
index 941b18db0931ad..f7a44fea5b02b9 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
@@ -1185,451 +1185,429 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
;
; AVX2-LABEL: store_i8_stride5_vf16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-NEXT: vmovdqa (%rsi), %xmm1
-; AVX2-NEXT: vmovdqa (%rdx), %xmm3
-; AVX2-NEXT: vmovdqa (%rcx), %xmm4
-; AVX2-NEXT: vmovdqa (%r8), %xmm2
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5
-; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm6
-; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[6],zero,zero,zero,zero,ymm6[7],zero,zero,zero,zero,ymm6[8],zero,zero,zero,zero,ymm6[9,25],zero,zero,zero,zero,ymm6[26],zero,zero,zero,zero,ymm6[27],zero,zero,zero,zero,ymm6[28]
-; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[6],zero,zero,zero,zero,ymm8[7],zero,zero,zero,zero,ymm8[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm8[26],zero,zero,zero,zero,ymm8[27],zero,zero,zero,zero,ymm8[28],zero
-; AVX2-NEXT: vpor %ymm7, %ymm8, %ymm7
-; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,ymm5[9],zero,zero,zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28],zero,zero
-; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm5[2,3,0,1]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,ymm9[7],zero,zero,zero,zero,ymm9[8],zero,zero,zero,zero,ymm9[9],zero,zero,zero,ymm9[26],zero,zero,zero,zero,ymm9[27],zero,zero,zero,zero,ymm9[28],zero,zero,zero
-; AVX2-NEXT: vpor %ymm8, %ymm9, %ymm8
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255]
-; AVX2-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7
-; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[1,1,2,2]
-; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,1,1]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
-; AVX2-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7
-; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,0,2]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm6[0,8],zero,zero,zero,ymm6[1,9],zero,zero,zero,ymm6[2,10],zero,zero,zero,ymm6[19,27],zero,zero,zero,ymm6[20,28],zero,zero,zero,ymm6[21,29],zero,zero,zero
-; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,0]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,8],zero,zero,zero,ymm5[1,9],zero,zero,zero,ymm5[2,10],zero,zero,zero,ymm5[3,19],zero,zero,zero,ymm5[28,20],zero,zero,zero,ymm5[29,21],zero,zero,zero,ymm5[30,22]
-; AVX2-NEXT: vpor %ymm6, %ymm5, %ymm5
-; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,0,1,1]
-; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
-; AVX2-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5
-; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[10,11],zero,zero,zero,xmm3[12,13],zero,zero,zero,xmm3[14,15],zero
-; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[10,11],zero,zero,zero,xmm0[12,13],zero,zero,zero,xmm0[14,15],zero,zero,zero
-; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[12],zero,zero,zero,zero,xmm2[13],zero,zero,zero,zero,xmm2[14],zero,zero,zero,zero,xmm2[15]
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa (%rdi), %xmm1
+; AVX2-NEXT: vmovdqa (%rdx), %xmm2
+; AVX2-NEXT: vmovdqa (%r8), %xmm0
+; AVX2-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1
+; AVX2-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2
+; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[6],zero,zero,zero,zero,ymm2[7],zero,zero,zero,zero,ymm2[8],zero,zero,zero,zero,ymm2[9,25],zero,zero,zero,zero,ymm2[26],zero,zero,zero,zero,ymm2[27],zero,zero,zero,zero,ymm2[28]
+; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = zero,ymm4[6],zero,zero,zero,zero,ymm4[7],zero,zero,zero,zero,ymm4[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[26],zero,zero,zero,zero,ymm4[27],zero,zero,zero,zero,ymm4[28],zero
+; AVX2-NEXT: vpor %ymm4, %ymm3, %ymm3
+; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,ymm1[7],zero,zero,zero,zero,ymm1[8],zero,zero,zero,zero,ymm1[9],zero,zero,zero,zero,zero,ymm1[26],zero,zero,zero,zero,ymm1[27],zero,zero,zero,zero,ymm1[28],zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm1[2,3,0,1]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,ymm5[9],zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28],zero,zero,zero
+; AVX2-NEXT: vpor %ymm4, %ymm5, %ymm4
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255]
+; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
+; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,2,2]
+; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
+; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm2[1,3,2,3]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[5,13],zero,zero,zero,xmm4[6,14],zero,zero,zero,xmm4[7,15],zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm1[1,3,2,3]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,xmm5[5,13],zero,zero,zero,xmm5[6,14],zero,zero,zero,xmm5[7,15],zero,zero,zero
+; AVX2-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[19,27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,0]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,8],zero,zero,zero,ymm1[1,9],zero,zero,zero,ymm1[2,10],zero,zero,zero,ymm1[3,19],zero,zero,zero,ymm1[28,20],zero,zero,zero,ymm1[29,21],zero,zero,zero,ymm1[30,22]
+; AVX2-NEXT: vpor %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
+; AVX2-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vmovdqa %ymm1, (%r9)
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15]
+; AVX2-NEXT: vpor %xmm0, %xmm4, %xmm0
; AVX2-NEXT: vmovdqa %xmm0, 64(%r9)
-; AVX2-NEXT: vmovdqa %ymm5, (%r9)
-; AVX2-NEXT: vmovdqa %ymm7, 32(%r9)
+; AVX2-NEXT: vmovdqa %ymm3, 32(%r9)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX2-FP-LABEL: store_i8_stride5_vf16:
; AVX2-FP: # %bb.0:
-; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm1
-; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm3
-; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm4
-; AVX2-FP-NEXT: vmovdqa (%r8), %xmm2
-; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5
-; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm6
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[6],zero,zero,zero,zero,ymm6[7],zero,zero,zero,zero,ymm6[8],zero,zero,zero,zero,ymm6[9,25],zero,zero,zero,zero,ymm6[26],zero,zero,zero,zero,ymm6[27],zero,zero,zero,zero,ymm6[28]
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[6],zero,zero,zero,zero,ymm8[7],zero,zero,zero,zero,ymm8[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm8[26],zero,zero,zero,zero,ymm8[27],zero,zero,zero,zero,ymm8[28],zero
-; AVX2-FP-NEXT: vpor %ymm7, %ymm8, %ymm7
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,ymm5[9],zero,zero,zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28],zero,zero
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm5[2,3,0,1]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,ymm9[7],zero,zero,zero,zero,ymm9[8],zero,zero,zero,zero,ymm9[9],zero,zero,zero,ymm9[26],zero,zero,zero,zero,ymm9[27],zero,zero,zero,zero,ymm9[28],zero,zero,zero
-; AVX2-FP-NEXT: vpor %ymm8, %ymm9, %ymm8
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255]
-; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[1,1,2,2]
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,1,1]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
-; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,0,2]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm6[0,8],zero,zero,zero,ymm6[1,9],zero,zero,zero,ymm6[2,10],zero,zero,zero,ymm6[19,27],zero,zero,zero,ymm6[20,28],zero,zero,zero,ymm6[21,29],zero,zero,zero
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,0]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,8],zero,zero,zero,ymm5[1,9],zero,zero,zero,ymm5[2,10],zero,zero,zero,ymm5[3,19],zero,zero,zero,ymm5[28,20],zero,zero,zero,ymm5[29,21],zero,zero,zero,ymm5[30,22]
-; AVX2-FP-NEXT: vpor %ymm6, %ymm5, %ymm5
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,0,1,1]
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
-; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5
-; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[10,11],zero,zero,zero,xmm3[12,13],zero,zero,zero,xmm3[14,15],zero
-; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[10,11],zero,zero,zero,xmm0[12,13],zero,zero,zero,xmm0[14,15],zero,zero,zero
-; AVX2-FP-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[12],zero,zero,zero,zero,xmm2[13],zero,zero,zero,zero,xmm2[14],zero,zero,zero,zero,xmm2[15]
-; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm1
+; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm2
+; AVX2-FP-NEXT: vmovdqa (%r8), %xmm0
+; AVX2-FP-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1
+; AVX2-FP-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[6],zero,zero,zero,zero,ymm2[7],zero,zero,zero,zero,ymm2[8],zero,zero,zero,zero,ymm2[9,25],zero,zero,zero,zero,ymm2[26],zero,zero,zero,zero,ymm2[27],zero,zero,zero,zero,ymm2[28]
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = zero,ymm4[6],zero,zero,zero,zero,ymm4[7],zero,zero,zero,zero,ymm4[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[26],zero,zero,zero,zero,ymm4[27],zero,zero,zero,zero,ymm4[28],zero
+; AVX2-FP-NEXT: vpor %ymm4, %ymm3, %ymm3
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,ymm1[7],zero,zero,zero,zero,ymm1[8],zero,zero,zero,zero,ymm1[9],zero,zero,zero,zero,zero,ymm1[26],zero,zero,zero,zero,ymm1[27],zero,zero,zero,zero,ymm1[28],zero,zero
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm1[2,3,0,1]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,ymm5[9],zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28],zero,zero,zero
+; AVX2-FP-NEXT: vpor %ymm4, %ymm5, %ymm4
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255]
+; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,2,2]
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
+; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[1,3,2,3]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[5,13],zero,zero,zero,xmm4[6,14],zero,zero,zero,xmm4[7,15],zero
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm1[1,3,2,3]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,xmm5[5,13],zero,zero,zero,xmm5[6,14],zero,zero,zero,xmm5[7,15],zero,zero,zero
+; AVX2-FP-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[19,27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,0]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,8],zero,zero,zero,ymm1[1,9],zero,zero,zero,ymm1[2,10],zero,zero,zero,ymm1[3,19],zero,zero,zero,ymm1[28,20],zero,zero,zero,ymm1[29,21],zero,zero,zero,ymm1[30,22]
+; AVX2-FP-NEXT: vpor %ymm2, %ymm1, %ymm1
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
+; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1
+; AVX2-FP-NEXT: vmovdqa %ymm1, (%r9)
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15]
+; AVX2-FP-NEXT: vpor %xmm0, %xmm4, %xmm0
; AVX2-FP-NEXT: vmovdqa %xmm0, 64(%r9)
-; AVX2-FP-NEXT: vmovdqa %ymm5, (%r9)
-; AVX2-FP-NEXT: vmovdqa %ymm7, 32(%r9)
+; AVX2-FP-NEXT: vmovdqa %ymm3, 32(%r9)
; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
;
; AVX2-FCP-LABEL: store_i8_stride5_vf16:
; AVX2-FCP: # %bb.0:
-; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm1
+; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm1
; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm2
-; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm3
-; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm4
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,0,2]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm7[0,8],zero,zero,zero,ymm7[1,9],zero,zero,zero,ymm7[2,10],zero,zero,zero,ymm7[19,27],zero,zero,zero,ymm7[20,28],zero,zero,zero,ymm7[21,29],zero,zero,zero
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm5[0,2,2,0]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,8],zero,zero,zero,ymm8[1,9],zero,zero,zero,ymm8[2,10],zero,zero,zero,ymm8[3,19],zero,zero,zero,ymm8[28,20],zero,zero,zero,ymm8[29,21],zero,zero,zero,ymm8[30,22]
-; AVX2-FCP-NEXT: vpor %ymm7, %ymm8, %ymm7
-; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,0,0,1,1]
-; AVX2-FCP-NEXT: vpermd %ymm4, %ymm8, %ymm8
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
-; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7
-; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,5,2,6,2,6,3,7]
-; AVX2-FCP-NEXT: vpermd %ymm5, %ymm8, %ymm5
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,ymm5[3,7],zero,zero,zero,ymm5[8,12],zero,zero,zero,ymm5[9,13],zero,zero,zero,ymm5[18,22],zero,zero,zero,ymm5[19,23],zero,zero,zero,ymm5[24,28],zero,zero
-; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,5,2,6,6,2,3,7]
-; AVX2-FCP-NEXT: vpermd %ymm6, %ymm8, %ymm6
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[2,6],zero,zero,zero,ymm6[3,7],zero,zero,zero,ymm6[8,12],zero,zero,zero,ymm6[9,17],zero,zero,zero,ymm6[22,18],zero,zero,zero,ymm6[23,19],zero,zero,zero,ymm6[24,28]
-; AVX2-FCP-NEXT: vpor %ymm5, %ymm6, %ymm5
-; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,1,2,2,2,2,2,2]
-; AVX2-FCP-NEXT: vpermd %ymm4, %ymm6, %ymm6
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
-; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5
-; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[10,11],zero,zero,zero,xmm2[12,13],zero,zero,zero,xmm2[14,15],zero
-; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[10,11],zero,zero,zero,xmm0[12,13],zero,zero,zero,xmm0[14,15],zero,zero,zero
-; AVX2-FCP-NEXT: vpor %xmm2, %xmm0, %xmm0
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[12],zero,zero,zero,zero,xmm4[13],zero,zero,zero,zero,xmm4[14],zero,zero,zero,zero,xmm4[15]
-; AVX2-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm0
+; AVX2-FCP-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1
+; AVX2-FCP-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2
+; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[1,3,2,3]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,13],zero,zero,zero,xmm3[6,14],zero,zero,zero,xmm3[7,15],zero
+; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm1[1,3,2,3]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm4[5,13],zero,zero,zero,xmm4[6,14],zero,zero,zero,xmm4[7,15],zero,zero,zero
+; AVX2-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,0,2]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[0,8],zero,zero,zero,ymm4[1,9],zero,zero,zero,ymm4[2,10],zero,zero,zero,ymm4[19,27],zero,zero,zero,ymm4[20,28],zero,zero,zero,ymm4[21,29],zero,zero,zero
+; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,2,2,0]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,8],zero,zero,zero,ymm5[1,9],zero,zero,zero,ymm5[2,10],zero,zero,zero,ymm5[3,19],zero,zero,zero,ymm5[28,20],zero,zero,zero,ymm5[29,21],zero,zero,zero,ymm5[30,22]
+; AVX2-FCP-NEXT: vpor %ymm4, %ymm5, %ymm4
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,0,1,1]
+; AVX2-FCP-NEXT: vpermd %ymm0, %ymm5, %ymm5
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
+; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,5,2,6,2,6,3,7]
+; AVX2-FCP-NEXT: vpermd %ymm1, %ymm5, %ymm1
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm1[3,7],zero,zero,zero,ymm1[8,12],zero,zero,zero,ymm1[9,13],zero,zero,zero,ymm1[18,22],zero,zero,zero,ymm1[19,23],zero,zero,zero,ymm1[24,28],zero,zero
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,5,2,6,6,2,3,7]
+; AVX2-FCP-NEXT: vpermd %ymm2, %ymm5, %ymm2
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[2,6],zero,zero,zero,ymm2[3,7],zero,zero,zero,ymm2[8,12],zero,zero,zero,ymm2[9,17],zero,zero,zero,ymm2[22,18],zero,zero,zero,ymm2[23,19],zero,zero,zero,ymm2[24,28]
+; AVX2-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,1,2,2,2,2,2,2]
+; AVX2-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm2
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
+; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1
+; AVX2-FCP-NEXT: vmovdqa %ymm1, 32(%r9)
+; AVX2-FCP-NEXT: vmovdqa %ymm4, (%r9)
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15]
+; AVX2-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0
; AVX2-FCP-NEXT: vmovdqa %xmm0, 64(%r9)
-; AVX2-FCP-NEXT: vmovdqa %ymm5, 32(%r9)
-; AVX2-FCP-NEXT: vmovdqa %ymm7, (%r9)
; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
;
; AVX512-LABEL: store_i8_stride5_vf16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vmovdqa (%rsi), %xmm2
-; AVX512-NEXT: vmovdqa (%rdx), %xmm3
-; AVX512-NEXT: vmovdqa (%rcx), %xmm4
-; AVX512-NEXT: vmovdqa (%r8), %xmm1
-; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm5
-; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm6
-; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,7],zero,ymm6[u,u,u,8],zero,ymm6[u,u,u,9],zero,ymm6[u,u,u],zero,ymm6[26,u,u,u],zero,ymm6[27,u,u,u],zero,ymm6[28,u,u]
-; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u],zero,ymm8[7,u,u,u],zero,ymm8[8,u,u,u],zero,ymm8[9,u,u,u,26],zero,ymm8[u,u,u,27],zero,ymm8[u,u,u,28],zero,ymm8[u,u]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255]
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ~ymm9 & (ymm8 | ymm7)
-; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm5[0,2,0,2]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm7[0,8,u],zero,zero,ymm7[1,9,u],zero,zero,ymm7[2,10,u],zero,zero,ymm7[19,27,u],zero,zero,ymm7[20,28,u],zero,zero,ymm7[21,29,u],zero,zero
-; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7
-; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[6],zero,ymm5[u,u,u,7],zero,ymm5[u,u,u,8],zero,ymm5[u,u,u,9,25,u,u,u],zero,ymm5[26,u,u,u],zero,ymm5[27,u,u,u],zero,ymm5[28]
-; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = zero,ymm5[6,u,u,u],zero,ymm5[7,u,u,u],zero,ymm5[8,u,u,u],zero,zero,ymm5[u,u,u,26],zero,ymm5[u,u,u,27],zero,ymm5[u,u,u,28],zero
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm9 & (ymm5 | ymm8)
-; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,0]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,8],zero,zero,ymm6[u,1,9],zero,zero,ymm6[u,2,10],zero,zero,ymm6[u,3,19],zero,zero,ymm6[u,28,20],zero,zero,ymm6[u,29,21],zero,zero,ymm6[u,30,22]
-; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5
-; AVX512-NEXT: vporq %zmm7, %zmm5, %zmm5
-; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0]
-; AVX512-NEXT: vpermd %zmm1, %zmm6, %zmm6
-; AVX512-NEXT: vpternlogd {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5))
-; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u],zero,zero,xmm3[10,11,u],zero,zero,xmm3[12,13,u],zero,zero,xmm3[14,15,u]
-; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,10,11],zero,zero,xmm0[u,12,13],zero,zero,xmm0[u,14,15],zero,zero,xmm0[u]
-; AVX512-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[1,2,3,4],zero,xmm0[6,7,8,9],zero,xmm0[11,12,13,14],zero
-; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[12],zero,zero,zero,zero,xmm1[13],zero,zero,zero,zero,xmm1[14],zero,zero,zero,zero,xmm1[15]
-; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa (%rdi), %xmm1
+; AVX512-NEXT: vmovdqa (%rdx), %xmm2
+; AVX512-NEXT: vmovdqa (%r8), %xmm0
+; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2
+; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1
+; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,7],zero,ymm1[u,u,u,8],zero,ymm1[u,u,u,9],zero,ymm1[u,u,u],zero,ymm1[26,u,u,u],zero,ymm1[27,u,u,u],zero,ymm1[28,u,u]
+; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u],zero,ymm4[7,u,u,u],zero,ymm4[8,u,u,u],zero,ymm4[9,u,u,u,26],zero,ymm4[u,u,u,27],zero,ymm4[u,u,u,28],zero,ymm4[u,u]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255]
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ~ymm5 & (ymm4 | ymm3)
+; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,2,0,2]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u],zero,zero,ymm3[1,9,u],zero,zero,ymm3[2,10,u],zero,zero,ymm3[19,27,u],zero,zero,ymm3[20,28,u],zero,zero,ymm3[21,29,u],zero,zero
+; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
+; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[6],zero,ymm2[u,u,u,7],zero,ymm2[u,u,u,8],zero,ymm2[u,u,u,9,25,u,u,u],zero,ymm2[26,u,u,u],zero,ymm2[27,u,u,u],zero,ymm2[28]
+; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = zero,ymm6[6,u,u,u],zero,ymm6[7,u,u,u],zero,ymm6[8,u,u,u],zero,zero,ymm6[u,u,u,26],zero,ymm6[u,u,u,27],zero,ymm6[u,u,u,28],zero
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm5 & (ymm6 | ymm4)
+; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,2,2,0]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,ymm4[u,1,9],zero,zero,ymm4[u,2,10],zero,zero,ymm4[u,3,19],zero,zero,ymm4[u,28,20],zero,zero,ymm4[u,29,21],zero,zero,ymm4[u,30,22]
+; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4
+; AVX512-NEXT: vporq %zmm3, %zmm4, %zmm3
+; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0]
+; AVX512-NEXT: vpermd %zmm0, %zmm4, %zmm4
+; AVX512-NEXT: vpternlogd {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm3))
+; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u],zero,zero,xmm2[5,13,u],zero,zero,xmm2[6,14,u],zero,zero,xmm2[7,15,u]
+; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,5,13],zero,zero,xmm1[u,6,14],zero,zero,xmm1[u,7,15],zero,zero,xmm1[u]
+; AVX512-NEXT: vpor %xmm2, %xmm1, %xmm1
+; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[1,2,3,4],zero,xmm1[6,7,8,9],zero,xmm1[11,12,13,14],zero
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15]
+; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vmovdqa %xmm0, 64(%r9)
-; AVX512-NEXT: vmovdqa64 %zmm6, (%r9)
+; AVX512-NEXT: vmovdqa64 %zmm4, (%r9)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: store_i8_stride5_vf16:
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm1
-; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm2
-; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm3
-; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm4
+; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm2
; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm0
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm5
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm6
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,0,2]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm7[0,8,u],zero,zero,ymm7[1,9,u],zero,zero,ymm7[2,10,u],zero,zero,ymm7[19,27,u],zero,zero,ymm7[20,28,u],zero,zero,ymm7[21,29,u],zero,zero
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,5,2,6,2,6,3,7]
-; AVX512-FCP-NEXT: vpermd %ymm5, %ymm8, %ymm8
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,ymm8[u,3,7],zero,zero,ymm8[u,8,12],zero,zero,ymm8[u,9,13],zero,zero,ymm8[u,18,22],zero,zero,ymm8[u,19,23],zero,zero,ymm8[u,24,28],zero,zero
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,0]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,8],zero,zero,ymm5[u,1,9],zero,zero,ymm5[u,2,10],zero,zero,ymm5[u,3,19],zero,zero,ymm5[u,28,20],zero,zero,ymm5[u,29,21],zero,zero,ymm5[u,30,22]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,5,2,6,6,2,3,7]
-; AVX512-FCP-NEXT: vpermd %ymm6, %ymm8, %ymm6
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[2,6,u],zero,zero,ymm6[3,7,u],zero,zero,ymm6[8,12,u],zero,zero,ymm6[9,17,u],zero,zero,ymm6[22,18,u],zero,zero,ymm6[23,19,u],zero,zero,ymm6[24,28]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5
-; AVX512-FCP-NEXT: vporq %zmm7, %zmm5, %zmm5
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm6
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,9,9,0,10,10,10,10,0]
-; AVX512-FCP-NEXT: vpermd %zmm6, %zmm7, %zmm6
-; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5))
-; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u],zero,zero,xmm3[10,11,u],zero,zero,xmm3[12,13,u],zero,zero,xmm3[14,15,u]
-; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,10,11],zero,zero,xmm1[u,12,13],zero,zero,xmm1[u,14,15],zero,zero,xmm1[u]
-; AVX512-FCP-NEXT: vpor %xmm3, %xmm1, %xmm1
+; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1
+; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,2,0,2]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u],zero,zero,ymm3[1,9,u],zero,zero,ymm3[2,10,u],zero,zero,ymm3[19,27,u],zero,zero,ymm3[20,28,u],zero,zero,ymm3[21,29,u],zero,zero
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,5,2,6,2,6,3,7]
+; AVX512-FCP-NEXT: vpermd %ymm1, %ymm4, %ymm4
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[u,3,7],zero,zero,ymm4[u,8,12],zero,zero,ymm4[u,9,13],zero,zero,ymm4[u,18,22],zero,zero,ymm4[u,19,23],zero,zero,ymm4[u,24,28],zero,zero
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,2,2,0]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,ymm4[u,1,9],zero,zero,ymm4[u,2,10],zero,zero,ymm4[u,3,19],zero,zero,ymm4[u,28,20],zero,zero,ymm4[u,29,21],zero,zero,ymm4[u,30,22]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,5,2,6,6,2,3,7]
+; AVX512-FCP-NEXT: vpermd %ymm2, %ymm5, %ymm5
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[2,6,u],zero,zero,ymm5[3,7,u],zero,zero,ymm5[8,12,u],zero,zero,ymm5[9,17,u],zero,zero,ymm5[22,18,u],zero,zero,ymm5[23,19,u],zero,zero,ymm5[24,28]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4
+; AVX512-FCP-NEXT: vporq %zmm3, %zmm4, %zmm3
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm4
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,1,1,9,9,0,10,10,10,10,0]
+; AVX512-FCP-NEXT: vpermd %zmm4, %zmm5, %zmm4
+; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm3))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u],zero,zero,xmm2[5,13,u],zero,zero,xmm2[6,14,u],zero,zero,xmm2[7,15,u]
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,5,13],zero,zero,xmm1[u,6,14],zero,zero,xmm1[u,7,15],zero,zero,xmm1[u]
+; AVX512-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[1,2,3,4],zero,xmm1[6,7,8,9],zero,xmm1[11,12,13,14],zero
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15]
; AVX512-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX512-FCP-NEXT: vmovdqa %xmm0, 64(%r9)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm6, (%r9)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%r9)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: store_i8_stride5_vf16:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm2
-; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm3
-; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm4
-; AVX512DQ-NEXT: vmovdqa (%r8), %xmm1
-; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm5
-; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm6
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,7],zero,ymm6[u,u,u,8],zero,ymm6[u,u,u,9],zero,ymm6[u,u,u],zero,ymm6[26,u,u,u],zero,ymm6[27,u,u,u],zero,ymm6[28,u,u]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u],zero,ymm8[7,u,u,u],zero,ymm8[8,u,u,u],zero,ymm8[9,u,u,u,26],zero,ymm8[u,u,u,27],zero,ymm8[u,u,u,28],zero,ymm8[u,u]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ~ymm9 & (ymm8 | ymm7)
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm5[0,2,0,2]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm7[0,8,u],zero,zero,ymm7[1,9,u],zero,zero,ymm7[2,10,u],zero,zero,ymm7[19,27,u],zero,zero,ymm7[20,28,u],zero,zero,ymm7[21,29,u],zero,zero
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[6],zero,ymm5[u,u,u,7],zero,ymm5[u,u,u,8],zero,ymm5[u,u,u,9,25,u,u,u],zero,ymm5[26,u,u,u],zero,ymm5[27,u,u,u],zero,ymm5[28]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = zero,ymm5[6,u,u,u],zero,ymm5[7,u,u,u],zero,ymm5[8,u,u,u],zero,zero,ymm5[u,u,u,26],zero,ymm5[u,u,u,27],zero,ymm5[u,u,u,28],zero
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm9 & (ymm5 | ymm8)
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,0]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,8],zero,zero,ymm6[u,1,9],zero,zero,ymm6[u,2,10],zero,zero,ymm6[u,3,19],zero,zero,ymm6[u,28,20],zero,zero,ymm6[u,29,21],zero,zero,ymm6[u,30,22]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5
-; AVX512DQ-NEXT: vporq %zmm7, %zmm5, %zmm5
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0]
-; AVX512DQ-NEXT: vpermd %zmm1, %zmm6, %zmm6
-; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5))
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u],zero,zero,xmm3[10,11,u],zero,zero,xmm3[12,13,u],zero,zero,xmm3[14,15,u]
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,10,11],zero,zero,xmm0[u,12,13],zero,zero,xmm0[u,14,15],zero,zero,xmm0[u]
-; AVX512DQ-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[1,2,3,4],zero,xmm0[6,7,8,9],zero,xmm0[11,12,13,14],zero
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[12],zero,zero,zero,zero,xmm1[13],zero,zero,zero,zero,xmm1[14],zero,zero,zero,zero,xmm1[15]
-; AVX512DQ-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
+; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm2
+; AVX512DQ-NEXT: vmovdqa (%r8), %xmm0
+; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2
+; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,7],zero,ymm1[u,u,u,8],zero,ymm1[u,u,u,9],zero,ymm1[u,u,u],zero,ymm1[26,u,u,u],zero,ymm1[27,u,u,u],zero,ymm1[28,u,u]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u],zero,ymm4[7,u,u,u],zero,ymm4[8,u,u,u],zero,ymm4[9,u,u,u,26],zero,ymm4[u,u,u,27],zero,ymm4[u,u,u,28],zero,ymm4[u,u]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ~ymm5 & (ymm4 | ymm3)
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,2,0,2]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u],zero,zero,ymm3[1,9,u],zero,zero,ymm3[2,10,u],zero,zero,ymm3[19,27,u],zero,zero,ymm3[20,28,u],zero,zero,ymm3[21,29,u],zero,zero
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[6],zero,ymm2[u,u,u,7],zero,ymm2[u,u,u,8],zero,ymm2[u,u,u,9,25,u,u,u],zero,ymm2[26,u,u,u],zero,ymm2[27,u,u,u],zero,ymm2[28]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = zero,ymm6[6,u,u,u],zero,ymm6[7,u,u,u],zero,ymm6[8,u,u,u],zero,zero,ymm6[u,u,u,26],zero,ymm6[u,u,u,27],zero,ymm6[u,u,u,28],zero
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm5 & (ymm6 | ymm4)
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,2,2,0]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,ymm4[u,1,9],zero,zero,ymm4[u,2,10],zero,zero,ymm4[u,3,19],zero,zero,ymm4[u,28,20],zero,zero,ymm4[u,29,21],zero,zero,ymm4[u,30,22]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4
+; AVX512DQ-NEXT: vporq %zmm3, %zmm4, %zmm3
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0]
+; AVX512DQ-NEXT: vpermd %zmm0, %zmm4, %zmm4
+; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm3))
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u],zero,zero,xmm2[5,13,u],zero,zero,xmm2[6,14,u],zero,zero,xmm2[7,15,u]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,5,13],zero,zero,xmm1[u,6,14],zero,zero,xmm1[u,7,15],zero,zero,xmm1[u]
+; AVX512DQ-NEXT: vpor %xmm2, %xmm1, %xmm1
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[1,2,3,4],zero,xmm1[6,7,8,9],zero,xmm1[11,12,13,14],zero
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15]
+; AVX512DQ-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX512DQ-NEXT: vmovdqa %xmm0, 64(%r9)
-; AVX512DQ-NEXT: vmovdqa64 %zmm6, (%r9)
+; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%r9)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: store_i8_stride5_vf16:
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm2
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm3
-; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm4
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm2
; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm0
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm5
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm6
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,0,2]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm7[0,8,u],zero,zero,ymm7[1,9,u],zero,zero,ymm7[2,10,u],zero,zero,ymm7[19,27,u],zero,zero,ymm7[20,28,u],zero,zero,ymm7[21,29,u],zero,zero
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,5,2,6,2,6,3,7]
-; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm8, %ymm8
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,ymm8[u,3,7],zero,zero,ymm8[u,8,12],zero,zero,ymm8[u,9,13],zero,zero,ymm8[u,18,22],zero,zero,ymm8[u,19,23],zero,zero,ymm8[u,24,28],zero,zero
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,0]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,8],zero,zero,ymm5[u,1,9],zero,zero,ymm5[u,2,10],zero,zero,ymm5[u,3,19],zero,zero,ymm5[u,28,20],zero,zero,ymm5[u,29,21],zero,zero,ymm5[u,30,22]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,5,2,6,6,2,3,7]
-; AVX512DQ-FCP-NEXT: vpermd %ymm6, %ymm8, %ymm6
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[2,6,u],zero,zero,ymm6[3,7,u],zero,zero,ymm6[8,12,u],zero,zero,ymm6[9,17,u],zero,zero,ymm6[22,18,u],zero,zero,ymm6[23,19,u],zero,zero,ymm6[24,28]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5
-; AVX512DQ-FCP-NEXT: vporq %zmm7, %zmm5, %zmm5
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm6
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,9,9,0,10,10,10,10,0]
-; AVX512DQ-FCP-NEXT: vpermd %zmm6, %zmm7, %zmm6
-; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5))
-; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u],zero,zero,xmm3[10,11,u],zero,zero,xmm3[12,13,u],zero,zero,xmm3[14,15,u]
-; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,10,11],zero,zero,xmm1[u,12,13],zero,zero,xmm1[u,14,15],zero,zero,xmm1[u]
-; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,2,0,2]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u],zero,zero,ymm3[1,9,u],zero,zero,ymm3[2,10,u],zero,zero,ymm3[19,27,u],zero,zero,ymm3[20,28,u],zero,zero,ymm3[21,29,u],zero,zero
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,5,2,6,2,6,3,7]
+; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm4, %ymm4
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[u,3,7],zero,zero,ymm4[u,8,12],zero,zero,ymm4[u,9,13],zero,zero,ymm4[u,18,22],zero,zero,ymm4[u,19,23],zero,zero,ymm4[u,24,28],zero,zero
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,2,2,0]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,ymm4[u,1,9],zero,zero,ymm4[u,2,10],zero,zero,ymm4[u,3,19],zero,zero,ymm4[u,28,20],zero,zero,ymm4[u,29,21],zero,zero,ymm4[u,30,22]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,5,2,6,6,2,3,7]
+; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm5, %ymm5
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[2,6,u],zero,zero,ymm5[3,7,u],zero,zero,ymm5[8,12,u],zero,zero,ymm5[9,17,u],zero,zero,ymm5[22,18,u],zero,zero,ymm5[23,19,u],zero,zero,ymm5[24,28]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4
+; AVX512DQ-FCP-NEXT: vporq %zmm3, %zmm4, %zmm3
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm4
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,1,1,9,9,0,10,10,10,10,0]
+; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm5, %zmm4
+; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm3))
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u],zero,zero,xmm2[5,13,u],zero,zero,xmm2[6,14,u],zero,zero,xmm2[7,15,u]
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,5,13],zero,zero,xmm1[u,6,14],zero,zero,xmm1[u,7,15],zero,zero,xmm1[u]
+; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[1,2,3,4],zero,xmm1[6,7,8,9],zero,xmm1[11,12,13,14],zero
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15]
; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, 64(%r9)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, (%r9)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%r9)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: store_i8_stride5_vf16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa (%rsi), %xmm1
-; AVX512BW-NEXT: vmovdqa (%rdx), %xmm2
-; AVX512BW-NEXT: vmovdqa (%rcx), %xmm3
-; AVX512BW-NEXT: vmovdqa (%r8), %xmm4
-; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm5
-; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm6
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,ymm6[7],zero,zero,zero,zero,ymm6[8],zero,zero,zero,zero,ymm6[9],zero,zero,zero,zero,zero,ymm6[26],zero,zero,zero,zero,ymm6[27],zero,zero,zero,zero,ymm6[28],zero,zero
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1]
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,ymm8[7],zero,zero,zero,zero,ymm8[8],zero,zero,zero,zero,ymm8[9],zero,zero,zero,ymm8[26],zero,zero,zero,zero,ymm8[27],zero,zero,zero,zero,ymm8[28],zero,zero,zero
-; AVX512BW-NEXT: vpor %ymm7, %ymm8, %ymm7
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[6],zero,zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,ymm5[9,25],zero,zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28]
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm9 = ymm5[2,3,0,1]
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm9 = zero,ymm9[6],zero,zero,zero,zero,ymm9[7],zero,zero,zero,zero,ymm9[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[26],zero,zero,zero,zero,ymm9[27],zero,zero,zero,zero,ymm9[28],zero
-; AVX512BW-NEXT: vpor %ymm8, %ymm9, %ymm8
+; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1
+; AVX512BW-NEXT: vmovdqa (%r8), %xmm2
+; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
+; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm0[7],zero,zero,zero,zero,ymm0[8],zero,zero,zero,zero,ymm0[9],zero,zero,zero,zero,zero,ymm0[26],zero,zero,zero,zero,ymm0[27],zero,zero,zero,zero,ymm0[28],zero,zero
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1]
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[7],zero,zero,zero,zero,ymm4[8],zero,zero,zero,zero,ymm4[9],zero,zero,zero,ymm4[26],zero,zero,zero,zero,ymm4[27],zero,zero,zero,zero,ymm4[28],zero,zero,zero
+; AVX512BW-NEXT: vpor %ymm4, %ymm3, %ymm3
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm1[6],zero,zero,zero,zero,ymm1[7],zero,zero,zero,zero,ymm1[8],zero,zero,zero,zero,ymm1[9,25],zero,zero,zero,zero,ymm1[26],zero,zero,zero,zero,ymm1[27],zero,zero,zero,zero,ymm1[28]
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[2,3,0,1]
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,ymm5[6],zero,zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28],zero
+; AVX512BW-NEXT: vpor %ymm4, %ymm5, %ymm4
; AVX512BW-NEXT: movl $831283992, %eax # imm = 0x318C6318
; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: vmovdqu8 %ymm7, %ymm8 {%k1}
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,0]
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,8],zero,zero,zero,ymm6[1,9],zero,zero,zero,ymm6[2,10],zero,zero,zero,ymm6[3,19],zero,zero,zero,ymm6[28,20],zero,zero,zero,ymm6[29,21],zero,zero,zero,ymm6[30,22]
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,0,2]
+; AVX512BW-NEXT: vmovdqu8 %ymm3, %ymm4 {%k1}
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,2,2,0]
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,8],zero,zero,zero,ymm3[1,9],zero,zero,zero,ymm3[2,10],zero,zero,zero,ymm3[3,19],zero,zero,zero,ymm3[28,20],zero,zero,zero,ymm3[29,21],zero,zero,zero,ymm3[30,22]
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,2,0,2]
; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm5[0,8],zero,zero,zero,ymm5[1,9],zero,zero,zero,ymm5[2,10],zero,zero,zero,ymm5[19,27],zero,zero,zero,ymm5[20,28],zero,zero,zero,ymm5[21,29],zero,zero,zero
-; AVX512BW-NEXT: vpor %ymm6, %ymm5, %ymm5
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm5
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2]
-; AVX512BW-NEXT: vpermd %zmm4, %zmm6, %zmm6
+; AVX512BW-NEXT: vpor %ymm3, %ymm5, %ymm3
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2]
+; AVX512BW-NEXT: vpermd %zmm2, %zmm4, %zmm4
; AVX512BW-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210
; AVX512BW-NEXT: kmovq %rax, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm6, %zmm5 {%k1}
-; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[10,11],zero,zero,zero,xmm2[12,13],zero,zero,zero,xmm2[14,15],zero
-; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[10,11],zero,zero,zero,xmm0[12,13],zero,zero,zero,xmm0[14,15],zero,zero,zero
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[12],zero,zero,zero,zero,xmm4[13],zero,zero,zero,zero,xmm4[14],zero,zero,zero,zero,xmm4[15]
-; AVX512BW-NEXT: vpternlogq {{.*#+}} xmm1 = xmm1 | xmm0 | xmm2
-; AVX512BW-NEXT: vmovdqa %xmm1, 64(%r9)
-; AVX512BW-NEXT: vmovdqa64 %zmm5, (%r9)
+; AVX512BW-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1}
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,13],zero,zero,zero,xmm1[6,14],zero,zero,zero,xmm1[7,15],zero
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,2,3]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[5,13],zero,zero,zero,xmm0[6,14],zero,zero,zero,xmm0[7,15],zero,zero,zero
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12],zero,zero,zero,zero,xmm2[13],zero,zero,zero,zero,xmm2[14],zero,zero,zero,zero,xmm2[15]
+; AVX512BW-NEXT: vpternlogq {{.*#+}} xmm2 = xmm2 | xmm0 | xmm1
+; AVX512BW-NEXT: vmovdqa %xmm2, 64(%r9)
+; AVX512BW-NEXT: vmovdqa64 %zmm3, (%r9)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: store_i8_stride5_vf16:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm1
-; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm2
-; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %xmm3
-; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm4
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,5,2,6,6,2,3,7]
-; AVX512BW-FCP-NEXT: vpermd %ymm6, %ymm7, %ymm7
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm5[0,2,2,0]
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm7 = zmm7[0,8],zero,zero,zero,zmm7[1,9],zero,zero,zero,zmm7[2,10],zero,zero,zero,zmm7[3,19],zero,zero,zero,zmm7[28,20],zero,zero,zero,zmm7[29,21],zero,zero,zero,zmm7[30,22,34,38],zero,zero,zero,zmm7[35,39],zero,zero,zero,zmm7[40,44],zero,zero,zero,zmm7[41,49],zero,zero,zero,zmm7[54,50],zero,zero,zero,zmm7[55,51],zero,zero,zero,zmm7[56,60]
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,5,2,6,2,6,3,7]
-; AVX512BW-FCP-NEXT: vpermd %ymm5, %ymm8, %ymm5
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,0,2]
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm5 = zero,zero,zmm5[0,8],zero,zero,zero,zmm5[1,9],zero,zero,zero,zmm5[2,10],zero,zero,zero,zmm5[19,27],zero,zero,zero,zmm5[20,28],zero,zero,zero,zmm5[21,29],zero,zero,zero,zero,zero,zero,zmm5[35,39],zero,zero,zero,zmm5[40,44],zero,zero,zero,zmm5[41,45],zero,zero,zero,zmm5[50,54],zero,zero,zero,zmm5[51,55],zero,zero,zero,zmm5[56,60],zero,zero
-; AVX512BW-FCP-NEXT: vporq %zmm7, %zmm5, %zmm5
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm6
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,9,9,10,10,10,10,10,10]
-; AVX512BW-FCP-NEXT: vpermd %zmm6, %zmm7, %zmm6
+; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
+; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2
+; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
+; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,5,2,6,6,2,3,7]
+; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,2,0]
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[0,8],zero,zero,zero,zmm3[1,9],zero,zero,zero,zmm3[2,10],zero,zero,zero,zmm3[3,19],zero,zero,zero,zmm3[28,20],zero,zero,zero,zmm3[29,21],zero,zero,zero,zmm3[30,22,34,38],zero,zero,zero,zmm3[35,39],zero,zero,zero,zmm3[40,44],zero,zero,zero,zmm3[41,49],zero,zero,zero,zmm3[54,50],zero,zero,zero,zmm3[55,51],zero,zero,zero,zmm3[56,60]
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,5,2,6,2,6,3,7]
+; AVX512BW-FCP-NEXT: vpermd %ymm0, %ymm4, %ymm4
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,2,0,2]
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm4 = zero,zero,zmm4[0,8],zero,zero,zero,zmm4[1,9],zero,zero,zero,zmm4[2,10],zero,zero,zero,zmm4[19,27],zero,zero,zero,zmm4[20,28],zero,zero,zero,zmm4[21,29],zero,zero,zero,zero,zero,zero,zmm4[35,39],zero,zero,zero,zmm4[40,44],zero,zero,zero,zmm4[41,45],zero,zero,zero,zmm4[50,54],zero,zero,zero,zmm4[51,55],zero,zero,zero,zmm4[56,60],zero,zero
+; AVX512BW-FCP-NEXT: vporq %zmm3, %zmm4, %zmm3
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm4
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,1,1,9,9,10,10,10,10,10,10]
+; AVX512BW-FCP-NEXT: vpermd %zmm4, %zmm5, %zmm4
; AVX512BW-FCP-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210
; AVX512BW-FCP-NEXT: kmovq %rax, %k1
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm5 {%k1}
-; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[10,11],zero,zero,zero,xmm2[12,13],zero,zero,zero,xmm2[14,15],zero
-; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[10,11],zero,zero,zero,xmm0[12,13],zero,zero,zero,xmm0[14,15],zero,zero,zero
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[12],zero,zero,zero,zero,xmm4[13],zero,zero,zero,zero,xmm4[14],zero,zero,zero,zero,xmm4[15]
-; AVX512BW-FCP-NEXT: vpternlogq {{.*#+}} xmm1 = xmm1 | xmm0 | xmm2
-; AVX512BW-FCP-NEXT: vmovdqa %xmm1, 64(%r9)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%r9)
+; AVX512BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1}
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12],zero,zero,zero,zero,xmm2[13],zero,zero,zero,zero,xmm2[14],zero,zero,zero,zero,xmm2[15]
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,13],zero,zero,zero,xmm1[6,14],zero,zero,zero,xmm1[7,15],zero
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,2,3]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[5,13],zero,zero,zero,xmm0[6,14],zero,zero,zero,xmm0[7,15],zero,zero,zero
+; AVX512BW-FCP-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 | xmm2 | xmm1
+; AVX512BW-FCP-NEXT: vmovdqa %xmm0, 64(%r9)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%r9)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: store_i8_stride5_vf16:
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm1
-; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm2
-; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %xmm3
-; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm4
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm5
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm6
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,ymm6[7],zero,zero,zero,zero,ymm6[8],zero,zero,zero,zero,ymm6[9],zero,zero,zero,zero,zero,ymm6[26],zero,zero,zero,zero,ymm6[27],zero,zero,zero,zero,ymm6[28],zero,zero
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,ymm8[7],zero,zero,zero,zero,ymm8[8],zero,zero,zero,zero,ymm8[9],zero,zero,zero,ymm8[26],zero,zero,zero,zero,ymm8[27],zero,zero,zero,zero,ymm8[28],zero,zero,zero
-; AVX512DQ-BW-NEXT: vpor %ymm7, %ymm8, %ymm7
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[6],zero,zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,ymm5[9,25],zero,zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28]
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm9 = ymm5[2,3,0,1]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm9 = zero,ymm9[6],zero,zero,zero,zero,ymm9[7],zero,zero,zero,zero,ymm9[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[26],zero,zero,zero,zero,ymm9[27],zero,zero,zero,zero,ymm9[28],zero
-; AVX512DQ-BW-NEXT: vpor %ymm8, %ymm9, %ymm8
+; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1
+; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2
+; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
+; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm0[7],zero,zero,zero,zero,ymm0[8],zero,zero,zero,zero,ymm0[9],zero,zero,zero,zero,zero,ymm0[26],zero,zero,zero,zero,ymm0[27],zero,zero,zero,zero,ymm0[28],zero,zero
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[7],zero,zero,zero,zero,ymm4[8],zero,zero,zero,zero,ymm4[9],zero,zero,zero,ymm4[26],zero,zero,zero,zero,ymm4[27],zero,zero,zero,zero,ymm4[28],zero,zero,zero
+; AVX512DQ-BW-NEXT: vpor %ymm4, %ymm3, %ymm3
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm1[6],zero,zero,zero,zero,ymm1[7],zero,zero,zero,zero,ymm1[8],zero,zero,zero,zero,ymm1[9,25],zero,zero,zero,zero,ymm1[26],zero,zero,zero,zero,ymm1[27],zero,zero,zero,zero,ymm1[28]
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[2,3,0,1]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,ymm5[6],zero,zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28],zero
+; AVX512DQ-BW-NEXT: vpor %ymm4, %ymm5, %ymm4
; AVX512DQ-BW-NEXT: movl $831283992, %eax # imm = 0x318C6318
; AVX512DQ-BW-NEXT: kmovd %eax, %k1
-; AVX512DQ-BW-NEXT: vmovdqu8 %ymm7, %ymm8 {%k1}
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,0]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,8],zero,zero,zero,ymm6[1,9],zero,zero,zero,ymm6[2,10],zero,zero,zero,ymm6[3,19],zero,zero,zero,ymm6[28,20],zero,zero,zero,ymm6[29,21],zero,zero,zero,ymm6[30,22]
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,0,2]
+; AVX512DQ-BW-NEXT: vmovdqu8 %ymm3, %ymm4 {%k1}
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,2,2,0]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,8],zero,zero,zero,ymm3[1,9],zero,zero,zero,ymm3[2,10],zero,zero,zero,ymm3[3,19],zero,zero,zero,ymm3[28,20],zero,zero,zero,ymm3[29,21],zero,zero,zero,ymm3[30,22]
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,2,0,2]
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm5[0,8],zero,zero,zero,ymm5[1,9],zero,zero,zero,ymm5[2,10],zero,zero,zero,ymm5[19,27],zero,zero,zero,ymm5[20,28],zero,zero,zero,ymm5[21,29],zero,zero,zero
-; AVX512DQ-BW-NEXT: vpor %ymm6, %ymm5, %ymm5
-; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm5
-; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2]
-; AVX512DQ-BW-NEXT: vpermd %zmm4, %zmm6, %zmm6
+; AVX512DQ-BW-NEXT: vpor %ymm3, %ymm5, %ymm3
+; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
+; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2]
+; AVX512DQ-BW-NEXT: vpermd %zmm2, %zmm4, %zmm4
; AVX512DQ-BW-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210
; AVX512DQ-BW-NEXT: kmovq %rax, %k1
-; AVX512DQ-BW-NEXT: vmovdqu8 %zmm6, %zmm5 {%k1}
-; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[10,11],zero,zero,zero,xmm2[12,13],zero,zero,zero,xmm2[14,15],zero
-; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[10,11],zero,zero,zero,xmm0[12,13],zero,zero,zero,xmm0[14,15],zero,zero,zero
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[12],zero,zero,zero,zero,xmm4[13],zero,zero,zero,zero,xmm4[14],zero,zero,zero,zero,xmm4[15]
-; AVX512DQ-BW-NEXT: vpternlogq {{.*#+}} xmm1 = xmm1 | xmm0 | xmm2
-; AVX512DQ-BW-NEXT: vmovdqa %xmm1, 64(%r9)
-; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%r9)
+; AVX512DQ-BW-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1}
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,13],zero,zero,zero,xmm1[6,14],zero,zero,zero,xmm1[7,15],zero
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,2,3]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[5,13],zero,zero,zero,xmm0[6,14],zero,zero,zero,xmm0[7,15],zero,zero,zero
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12],zero,zero,zero,zero,xmm2[13],zero,zero,zero,zero,xmm2[14],zero,zero,zero,zero,xmm2[15]
+; AVX512DQ-BW-NEXT: vpternlogq {{.*#+}} xmm2 = xmm2 | xmm0 | xmm1
+; AVX512DQ-BW-NEXT: vmovdqa %xmm2, 64(%r9)
+; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%r9)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: store_i8_stride5_vf16:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm2
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %xmm3
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm4
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,5,2,6,6,2,3,7]
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm6, %ymm7, %ymm7
-; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm5[0,2,2,0]
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm7 = zmm7[0,8],zero,zero,zero,zmm7[1,9],zero,zero,zero,zmm7[2,10],zero,zero,zero,zmm7[3,19],zero,zero,zero,zmm7[28,20],zero,zero,zero,zmm7[29,21],zero,zero,zero,zmm7[30,22,34,38],zero,zero,zero,zmm7[35,39],zero,zero,zero,zmm7[40,44],zero,zero,zero,zmm7[41,49],zero,zero,zero,zmm7[54,50],zero,zero,zero,zmm7[55,51],zero,zero,zero,zmm7[56,60]
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,5,2,6,2,6,3,7]
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm5, %ymm8, %ymm5
-; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,0,2]
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm5 = zero,zero,zmm5[0,8],zero,zero,zero,zmm5[1,9],zero,zero,zero,zmm5[2,10],zero,zero,zero,zmm5[19,27],zero,zero,zero,zmm5[20,28],zero,zero,zero,zmm5[21,29],zero,zero,zero,zero,zero,zero,zmm5[35,39],zero,zero,zero,zmm5[40,44],zero,zero,zero,zmm5[41,45],zero,zero,zero,zmm5[50,54],zero,zero,zero,zmm5[51,55],zero,zero,zero,zmm5[56,60],zero,zero
-; AVX512DQ-BW-FCP-NEXT: vporq %zmm7, %zmm5, %zmm5
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm6
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,9,9,10,10,10,10,10,10]
-; AVX512DQ-BW-FCP-NEXT: vpermd %zmm6, %zmm7, %zmm6
+; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,5,2,6,6,2,3,7]
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3
+; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,2,0]
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[0,8],zero,zero,zero,zmm3[1,9],zero,zero,zero,zmm3[2,10],zero,zero,zero,zmm3[3,19],zero,zero,zero,zmm3[28,20],zero,zero,zero,zmm3[29,21],zero,zero,zero,zmm3[30,22,34,38],zero,zero,zero,zmm3[35,39],zero,zero,zero,zmm3[40,44],zero,zero,zero,zmm3[41,49],zero,zero,zero,zmm3[54,50],zero,zero,zero,zmm3[55,51],zero,zero,zero,zmm3[56,60]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,5,2,6,2,6,3,7]
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm0, %ymm4, %ymm4
+; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,2,0,2]
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm4 = zero,zero,zmm4[0,8],zero,zero,zero,zmm4[1,9],zero,zero,zero,zmm4[2,10],zero,zero,zero,zmm4[19,27],zero,zero,zero,zmm4[20,28],zero,zero,zero,zmm4[21,29],zero,zero,zero,zero,zero,zero,zmm4[35,39],zero,zero,zero,zmm4[40,44],zero,zero,zero,zmm4[41,45],zero,zero,zero,zmm4[50,54],zero,zero,zero,zmm4[51,55],zero,zero,zero,zmm4[56,60],zero,zero
+; AVX512DQ-BW-FCP-NEXT: vporq %zmm3, %zmm4, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,1,1,9,9,10,10,10,10,10,10]
+; AVX512DQ-BW-FCP-NEXT: vpermd %zmm4, %zmm5, %zmm4
; AVX512DQ-BW-FCP-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210
; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm5 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[10,11],zero,zero,zero,xmm2[12,13],zero,zero,zero,xmm2[14,15],zero
-; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[10,11],zero,zero,zero,xmm0[12,13],zero,zero,zero,xmm0[14,15],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[12],zero,zero,zero,zero,xmm4[13],zero,zero,zero,zero,xmm4[14],zero,zero,zero,zero,xmm4[15]
-; AVX512DQ-BW-FCP-NEXT: vpternlogq {{.*#+}} xmm1 = xmm1 | xmm0 | xmm2
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm1, 64(%r9)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%r9)
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12],zero,zero,zero,zero,xmm2[13],zero,zero,zero,zero,xmm2[14],zero,zero,zero,zero,xmm2[15]
+; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,13],zero,zero,zero,xmm1[6,14],zero,zero,zero,xmm1[7,15],zero
+; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,2,3]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[5,13],zero,zero,zero,xmm0[6,14],zero,zero,zero,xmm0[7,15],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 | xmm2 | xmm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, 64(%r9)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%r9)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%in.vec0 = load <16 x i8>, ptr %in.vecptr0, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
index 98a64ee987f7b0..ab968b91153a9e 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
@@ -1813,81 +1813,79 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2: # %bb.0:
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX2-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-NEXT: vmovdqa (%rsi), %xmm1
-; AVX2-NEXT: vmovdqa (%rdx), %xmm5
-; AVX2-NEXT: vmovdqa (%rcx), %xmm6
-; AVX2-NEXT: vmovdqa (%r8), %xmm3
-; AVX2-NEXT: vmovdqa (%r9), %xmm4
-; AVX2-NEXT: vmovdqa (%r10), %xmm2
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm7
-; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm8
-; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm10
-; AVX2-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm7[2,3,0,1]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,ymm11[5],zero,zero,zero,zero,zero,zero,ymm11[6],zero,zero,zero,zero,zero,ymm11[23],zero,zero,zero,zero,zero,zero,ymm11[24],zero,zero,zero,zero,zero,zero,ymm11[25]
-; AVX2-NEXT: vpor %ymm9, %ymm11, %ymm9
-; AVX2-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,ymm8[5],zero,zero,zero,zero,zero,zero,ymm8[6],zero,zero,zero,zero,zero,zero,zero,ymm8[23],zero,zero,zero,zero,zero,zero,ymm8[24],zero,zero,zero,zero
-; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm8[2,3,0,1]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,ymm12[5],zero,zero,zero,zero,zero,zero,ymm12[6],zero,zero,zero,zero,zero,ymm12[23],zero,zero,zero,zero,zero,zero,ymm12[24],zero,zero,zero,zero,zero
-; AVX2-NEXT: vpor %ymm12, %ymm11, %ymm11
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255]
-; AVX2-NEXT: vpblendvb %ymm12, %ymm9, %ymm11, %ymm9
-; AVX2-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[4],zero,zero,zero,zero,zero,zero,ymm10[5],zero,zero,zero,zero,zero,zero,ymm10[6],zero,zero,zero,zero,zero,zero,zero,ymm10[23],zero,zero,zero,zero,zero,zero,ymm10[24],zero,zero
-; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm12 = zero,ymm12[4],zero,zero,zero,zero,zero,zero,ymm12[5],zero,zero,zero,zero,zero,zero,ymm12[6],zero,zero,zero,zero,zero,ymm12[23],zero,zero,zero,zero,zero,zero,ymm12[24],zero,zero,zero
-; AVX2-NEXT: vpor %ymm12, %ymm11, %ymm11
-; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,0]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u]
-; AVX2-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255]
-; AVX2-NEXT: vpblendvb %ymm12, %ymm9, %ymm11, %ymm9
-; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm10[0,2,0,2]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[1,1,0,0,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,0]
-; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,0]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u]
-; AVX2-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11
-; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm8[0,2,0,2]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,ymm12[0,8],zero,zero,zero,zero,zero,ymm12[1,9],zero,zero,zero,zero,zero,ymm12[18,26],zero,zero,zero,zero,zero,ymm12[19,27],zero,zero,zero,zero,zero,ymm12[20,28]
-; AVX2-NEXT: vpermq {{.*#+}} ymm13 = ymm7[0,2,0,2]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[0,8],zero,zero,zero,zero,zero,ymm13[1,9],zero,zero,zero,zero,zero,ymm13[2,10],zero,zero,zero,zero,zero,ymm13[19,27],zero,zero,zero,zero,zero,ymm13[20,28],zero,zero
-; AVX2-NEXT: vpor %ymm12, %ymm13, %ymm12
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255]
-; AVX2-NEXT: vpblendvb %ymm13, %ymm12, %ymm11, %ymm11
-; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm10[1,3,1,3]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm12 = xmm2[0,1,2,3,4,5,5,6]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,2,3,3]
-; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255]
-; AVX2-NEXT: vpblendvb %ymm13, %ymm10, %ymm12, %ymm10
-; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[3,1,1,3]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[1],zero,zero,zero,zero,zero,ymm7[10,2],zero,zero,zero,zero,zero,ymm7[11,3],zero,zero,zero,zero,zero,ymm7[20,28],zero,zero,zero,zero,zero,ymm7[21,29],zero,zero,zero
-; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,3,3,1]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[1,9],zero,zero,zero,zero,zero,ymm8[2,10],zero,zero,zero,zero,zero,ymm8[3,19],zero,zero,zero,zero,zero,ymm8[28,20],zero,zero,zero,zero,zero,ymm8[29,21],zero
+; AVX2-NEXT: vmovdqa (%rdi), %xmm3
+; AVX2-NEXT: vmovdqa (%rdx), %xmm4
+; AVX2-NEXT: vmovdqa (%r8), %xmm1
+; AVX2-NEXT: vmovdqa (%r9), %xmm2
+; AVX2-NEXT: vmovdqa (%r10), %xmm0
+; AVX2-NEXT: vinserti128 $1, (%rsi), %ymm3, %ymm3
+; AVX2-NEXT: vinserti128 $1, (%rcx), %ymm4, %ymm4
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm6
+; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,ymm3[5],zero,zero,zero,zero,zero,zero,ymm3[6],zero,zero,zero,zero,zero,zero,zero,ymm3[23],zero,zero,zero,zero,zero,zero,ymm3[24],zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero,zero,zero,zero,ymm7[25]
+; AVX2-NEXT: vpor %ymm5, %ymm7, %ymm5
+; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,ymm4[5],zero,zero,zero,zero,zero,zero,ymm4[6],zero,zero,zero,zero,zero,zero,zero,ymm4[23],zero,zero,zero,zero,zero,zero,ymm4[24],zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm4[2,3,0,1]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,ymm8[5],zero,zero,zero,zero,zero,zero,ymm8[6],zero,zero,zero,zero,zero,ymm8[23],zero,zero,zero,zero,zero,zero,ymm8[24],zero,zero,zero,zero,zero
; AVX2-NEXT: vpor %ymm7, %ymm8, %ymm7
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0]
-; AVX2-NEXT: vpblendvb %ymm8, %ymm7, %ymm10, %ymm7
-; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,xmm5[12,13],zero,zero,zero,zero,zero,xmm5[14,15],zero,zero,zero
-; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[12,13],zero,zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero,zero
-; AVX2-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255]
+; AVX2-NEXT: vpblendvb %ymm8, %ymm5, %ymm7, %ymm5
+; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[4],zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[4],zero,zero,zero,zero,zero,zero,ymm8[5],zero,zero,zero,zero,zero,zero,ymm8[6],zero,zero,zero,zero,zero,ymm8[23],zero,zero,zero,zero,zero,zero,ymm8[24],zero,zero,zero
+; AVX2-NEXT: vpor %ymm7, %ymm8, %ymm7
+; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,0]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u]
+; AVX2-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255]
+; AVX2-NEXT: vpblendvb %ymm8, %ymm5, %ymm7, %ymm5
+; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,0,2]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[1,1,0,0,4,5,6,7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,2,0]
+; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,0]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u]
+; AVX2-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7
+; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm4[0,2,0,2]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,ymm8[0,8],zero,zero,zero,zero,zero,ymm8[1,9],zero,zero,zero,zero,zero,ymm8[18,26],zero,zero,zero,zero,zero,ymm8[19,27],zero,zero,zero,zero,zero,ymm8[20,28]
+; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm3[0,2,0,2]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[0,8],zero,zero,zero,zero,zero,ymm9[1,9],zero,zero,zero,zero,zero,ymm9[2,10],zero,zero,zero,zero,zero,ymm9[19,27],zero,zero,zero,zero,zero,ymm9[20,28],zero,zero
+; AVX2-NEXT: vpor %ymm8, %ymm9, %ymm8
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm9, %ymm8, %ymm7, %ymm7
+; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,3,1,3]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,5,6]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3]
+; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255]
+; AVX2-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm6
+; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm3[3,1,1,3]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[1],zero,zero,zero,zero,zero,ymm8[10,2],zero,zero,zero,zero,zero,ymm8[11,3],zero,zero,zero,zero,zero,ymm8[20,28],zero,zero,zero,zero,zero,ymm8[21,29],zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm4[1,3,3,1]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm9 = zero,ymm9[1,9],zero,zero,zero,zero,zero,ymm9[2,10],zero,zero,zero,zero,zero,ymm9[3,19],zero,zero,zero,zero,zero,ymm9[28,20],zero,zero,zero,zero,zero,ymm9[29,21],zero
+; AVX2-NEXT: vpor %ymm8, %ymm9, %ymm8
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0]
+; AVX2-NEXT: vpblendvb %ymm9, %ymm8, %ymm6, %ymm6
+; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,xmm4[6,14],zero,zero,zero,zero,zero,xmm4[7,15],zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[6,14],zero,zero,zero,zero,zero,xmm3[7,15],zero,zero,zero,zero,zero
+; AVX2-NEXT: vpor %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,7,7,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,3,2]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0]
-; AVX2-NEXT: vpblendvb %xmm3, %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0]
-; AVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,7,7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,2]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0]
+; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa %ymm6, 64(%rax)
+; AVX2-NEXT: vmovdqa %ymm7, (%rax)
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0]
+; AVX2-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vmovdqa %xmm0, 96(%rax)
-; AVX2-NEXT: vmovdqa %ymm7, 64(%rax)
-; AVX2-NEXT: vmovdqa %ymm11, (%rax)
-; AVX2-NEXT: vmovdqa %ymm9, 32(%rax)
+; AVX2-NEXT: vmovdqa %ymm5, 32(%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -1895,77 +1893,75 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm1
-; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm4
-; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm5
-; AVX2-FP-NEXT: vmovdqa (%r8), %xmm2
-; AVX2-FP-NEXT: vmovdqa (%r9), %xmm3
+; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm2
+; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm3
+; AVX2-FP-NEXT: vmovdqa (%r8), %xmm0
+; AVX2-FP-NEXT: vmovdqa (%r9), %xmm1
+; AVX2-FP-NEXT: vinserti128 $1, (%rsi), %ymm2, %ymm2
+; AVX2-FP-NEXT: vinserti128 $1, (%rcx), %ymm3, %ymm3
; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm6
-; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm7
-; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm10
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero,zero,zero,zero,zero
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm6[2,3,0,1]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,ymm9[5],zero,zero,zero,zero,zero,zero,ymm9[6],zero,zero,zero,zero,zero,ymm9[23],zero,zero,zero,zero,zero,zero,ymm9[24],zero,zero,zero,zero,zero,zero,ymm9[25]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,ymm2[5],zero,zero,zero,zero,zero,zero,ymm2[6],zero,zero,zero,zero,zero,zero,zero,ymm2[23],zero,zero,zero,zero,zero,zero,ymm2[24],zero,zero,zero,zero,zero,zero
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm2[2,3,0,1]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm5[5],zero,zero,zero,zero,zero,zero,ymm5[6],zero,zero,zero,zero,zero,ymm5[23],zero,zero,zero,zero,zero,zero,ymm5[24],zero,zero,zero,zero,zero,zero,ymm5[25]
+; AVX2-FP-NEXT: vpor %ymm4, %ymm5, %ymm4
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,ymm3[5],zero,zero,zero,zero,zero,zero,ymm3[6],zero,zero,zero,zero,zero,zero,zero,ymm3[23],zero,zero,zero,zero,zero,zero,ymm3[24],zero,zero,zero,zero
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero,zero,zero
+; AVX2-FP-NEXT: vpor %ymm7, %ymm5, %ymm5
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255]
+; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm4, %ymm5, %ymm5
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[4],zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm7 = zero,ymm7[4],zero,zero,zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero
+; AVX2-FP-NEXT: vpor %ymm7, %ymm4, %ymm7
+; AVX2-FP-NEXT: vmovdqa (%r10), %xmm4
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,0]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u]
+; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255]
+; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm5, %ymm7, %ymm5
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[2,3,2,3,0,1,0,1,8,9,10,11,2,3,2,3]
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,0]
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm6[0,2,0,2]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u]
+; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm8, %ymm7, %ymm7
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm3[0,2,0,2]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,ymm8[0,8],zero,zero,zero,zero,zero,ymm8[1,9],zero,zero,zero,zero,zero,ymm8[18,26],zero,zero,zero,zero,zero,ymm8[19,27],zero,zero,zero,zero,zero,ymm8[20,28]
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm2[0,2,0,2]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[0,8],zero,zero,zero,zero,zero,ymm9[1,9],zero,zero,zero,zero,zero,ymm9[2,10],zero,zero,zero,zero,zero,ymm9[19,27],zero,zero,zero,zero,zero,ymm9[20,28],zero,zero
; AVX2-FP-NEXT: vpor %ymm8, %ymm9, %ymm8
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero,zero
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm7[2,3,0,1]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,ymm11[5],zero,zero,zero,zero,zero,zero,ymm11[6],zero,zero,zero,zero,zero,ymm11[23],zero,zero,zero,zero,zero,zero,ymm11[24],zero,zero,zero,zero,zero
-; AVX2-FP-NEXT: vpor %ymm11, %ymm9, %ymm9
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255]
-; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm8, %ymm9, %ymm9
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm10[4],zero,zero,zero,zero,zero,zero,ymm10[5],zero,zero,zero,zero,zero,zero,ymm10[6],zero,zero,zero,zero,zero,zero,zero,ymm10[23],zero,zero,zero,zero,zero,zero,ymm10[24],zero,zero
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm11 = zero,ymm11[4],zero,zero,zero,zero,zero,zero,ymm11[5],zero,zero,zero,zero,zero,zero,ymm11[6],zero,zero,zero,zero,zero,ymm11[23],zero,zero,zero,zero,zero,zero,ymm11[24],zero,zero,zero
-; AVX2-FP-NEXT: vpor %ymm11, %ymm8, %ymm11
-; AVX2-FP-NEXT: vmovdqa (%r10), %xmm8
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,0]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u]
-; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255]
-; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm9, %ymm11, %ymm9
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[2,3,2,3,0,1,0,1,8,9,10,11,2,3,2,3]
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,1,0]
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm10[0,2,0,2]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u]
-; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm12, %ymm11, %ymm11
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm7[0,2,0,2]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,ymm12[0,8],zero,zero,zero,zero,zero,ymm12[1,9],zero,zero,zero,zero,zero,ymm12[18,26],zero,zero,zero,zero,zero,ymm12[19,27],zero,zero,zero,zero,zero,ymm12[20,28]
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm6[0,2,0,2]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[0,8],zero,zero,zero,zero,zero,ymm13[1,9],zero,zero,zero,zero,zero,ymm13[2,10],zero,zero,zero,zero,zero,ymm13[19,27],zero,zero,zero,zero,zero,ymm13[20,28],zero,zero
-; AVX2-FP-NEXT: vpor %ymm12, %ymm13, %ymm12
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255]
-; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm12, %ymm11, %ymm11
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[8,9,10,11,8,9,10,11,10,11,12,13,10,11,12,13]
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1]
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[1,3,1,3]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255]
-; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm10, %ymm12, %ymm10
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[3,1,1,3]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[1],zero,zero,zero,zero,zero,ymm6[10,2],zero,zero,zero,zero,zero,ymm6[11,3],zero,zero,zero,zero,zero,ymm6[20,28],zero,zero,zero,zero,zero,ymm6[21,29],zero,zero,zero
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,3,1]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm7 = zero,ymm7[1,9],zero,zero,zero,zero,zero,ymm7[2,10],zero,zero,zero,zero,zero,ymm7[3,19],zero,zero,zero,zero,zero,ymm7[28,20],zero,zero,zero,zero,zero,ymm7[29,21],zero
-; AVX2-FP-NEXT: vpor %ymm6, %ymm7, %ymm6
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0]
-; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm6, %ymm10, %ymm6
-; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,xmm4[12,13],zero,zero,zero,zero,zero,xmm4[14,15],zero,zero,zero
-; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[12,13],zero,zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero,zero
-; AVX2-FP-NEXT: vpor %xmm4, %xmm0, %xmm0
-; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,zero,zero,zero,zero,xmm1[13,12],zero,zero,zero,zero,zero,xmm1[15,14],zero
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm8[13,14,15,4,5],zero,zero,xmm8[14,15,14,15,12],zero,zero,xmm8[15]
-; AVX2-FP-NEXT: vpor %xmm2, %xmm1, %xmm1
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0]
-; AVX2-FP-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-FP-NEXT: vmovdqa %xmm0, 96(%rax)
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255]
+; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm8, %ymm7, %ymm7
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[8,9,10,11,8,9,10,11,10,11,12,13,10,11,12,13]
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1]
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,3,1,3]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255]
+; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm6
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm2[3,1,1,3]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[1],zero,zero,zero,zero,zero,ymm8[10,2],zero,zero,zero,zero,zero,ymm8[11,3],zero,zero,zero,zero,zero,ymm8[20,28],zero,zero,zero,zero,zero,ymm8[21,29],zero,zero,zero
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm3[1,3,3,1]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = zero,ymm9[1,9],zero,zero,zero,zero,zero,ymm9[2,10],zero,zero,zero,zero,zero,ymm9[3,19],zero,zero,zero,zero,zero,ymm9[28,20],zero,zero,zero,zero,zero,ymm9[29,21],zero
+; AVX2-FP-NEXT: vpor %ymm8, %ymm9, %ymm8
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0]
+; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm8, %ymm6, %ymm6
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm3[6,14],zero,zero,zero,zero,zero,xmm3[7,15],zero,zero,zero
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[6,14],zero,zero,zero,zero,zero,xmm2[7,15],zero,zero,zero,zero,zero
+; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10],zero,zero,zero,zero,zero,xmm0[13,12],zero,zero,zero,zero,zero,xmm0[15,14],zero
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm4[13,14,15,4,5],zero,zero,xmm4[14,15,14,15,12],zero,zero,xmm4[15]
+; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-FP-NEXT: vmovdqa %ymm6, 64(%rax)
-; AVX2-FP-NEXT: vmovdqa %ymm11, (%rax)
-; AVX2-FP-NEXT: vmovdqa %ymm9, 32(%rax)
+; AVX2-FP-NEXT: vmovdqa %ymm7, (%rax)
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0]
+; AVX2-FP-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX2-FP-NEXT: vmovdqa %xmm0, 96(%rax)
+; AVX2-FP-NEXT: vmovdqa %ymm5, 32(%rax)
; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
;
@@ -1973,75 +1969,73 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm1
-; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm5
-; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm6
-; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm3
-; AVX2-FCP-NEXT: vmovdqa (%r9), %xmm4
-; AVX2-FCP-NEXT: vmovdqa (%r10), %xmm2
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm9
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm10
-; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[1,1,0,0,4,5,6,7]
-; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,1,0,1,2,0,0,1]
-; AVX2-FCP-NEXT: vpermd %ymm7, %ymm11, %ymm7
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm10[0,2,0,2]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u]
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm11, %ymm7, %ymm7
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm9[0,2,0,2]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,ymm11[0,8],zero,zero,zero,zero,zero,ymm11[1,9],zero,zero,zero,zero,zero,ymm11[18,26],zero,zero,zero,zero,zero,ymm11[19,27],zero,zero,zero,zero,zero,ymm11[20,28]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm8[0,2,0,2]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,8],zero,zero,zero,zero,zero,ymm12[1,9],zero,zero,zero,zero,zero,ymm12[2,10],zero,zero,zero,zero,zero,ymm12[19,27],zero,zero,zero,zero,zero,ymm12[20,28],zero,zero
-; AVX2-FCP-NEXT: vpor %ymm11, %ymm12, %ymm11
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255]
-; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm11, %ymm7, %ymm7
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm2[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,1,0]
-; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [1,5,2,6,1,5,2,6]
-; AVX2-FCP-NEXT: # ymm12 = mem[0,1,0,1]
-; AVX2-FCP-NEXT: vpermd %ymm10, %ymm12, %ymm13
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[0,4,u,u,u,u,u,1,5,u,u,u,u,u,2,6,u,u,u,u,u,19,23,u,u,u,u,u,24,28,u,u]
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u]
-; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm13, %ymm11, %ymm11
-; AVX2-FCP-NEXT: vpermd %ymm9, %ymm12, %ymm13
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,ymm13[1,5],zero,zero,zero,zero,zero,ymm13[2,6],zero,zero,zero,zero,zero,ymm13[19,23],zero,zero,zero,zero,zero,ymm13[24,28],zero,zero,zero,zero
-; AVX2-FCP-NEXT: vpermd %ymm8, %ymm12, %ymm12
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,ymm12[1,5],zero,zero,zero,zero,zero,ymm12[2,6],zero,zero,zero,zero,zero,ymm12[19,23],zero,zero,zero,zero,zero,ymm12[24,28],zero,zero,zero,zero,zero,ymm12[25]
-; AVX2-FCP-NEXT: vpor %ymm13, %ymm12, %ymm12
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255]
-; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm12, %ymm11, %ymm11
-; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm12 = xmm2[0,1,2,3,4,5,5,6]
-; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [2,2,3,3,2,2,3,3]
-; AVX2-FCP-NEXT: # ymm13 = mem[0,1,0,1]
-; AVX2-FCP-NEXT: vpermd %ymm12, %ymm13, %ymm12
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[1,3,1,3]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21]
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255]
-; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm10, %ymm12, %ymm10
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[3,1,1,3]
+; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm3
+; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm4
+; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm1
+; AVX2-FCP-NEXT: vmovdqa (%r9), %xmm2
+; AVX2-FCP-NEXT: vmovdqa (%r10), %xmm0
+; AVX2-FCP-NEXT: vinserti128 $1, (%rsi), %ymm3, %ymm3
+; AVX2-FCP-NEXT: vinserti128 $1, (%rcx), %ymm4, %ymm4
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm6
+; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[1,1,0,0,4,5,6,7]
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,0,1,2,0,0,1]
+; AVX2-FCP-NEXT: vpermd %ymm5, %ymm7, %ymm5
+; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,0,2]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm7, %ymm5, %ymm5
+; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm4[0,2,0,2]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm7[0,8],zero,zero,zero,zero,zero,ymm7[1,9],zero,zero,zero,zero,zero,ymm7[18,26],zero,zero,zero,zero,zero,ymm7[19,27],zero,zero,zero,zero,zero,ymm7[20,28]
+; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm3[0,2,0,2]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,8],zero,zero,zero,zero,zero,ymm8[1,9],zero,zero,zero,zero,zero,ymm8[2,10],zero,zero,zero,zero,zero,ymm8[19,27],zero,zero,zero,zero,zero,ymm8[20,28],zero,zero
+; AVX2-FCP-NEXT: vpor %ymm7, %ymm8, %ymm7
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255]
+; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm7, %ymm5, %ymm5
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
+; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,0]
+; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,5,2,6,1,5,2,6]
+; AVX2-FCP-NEXT: # ymm8 = mem[0,1,0,1]
+; AVX2-FCP-NEXT: vpermd %ymm6, %ymm8, %ymm9
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[0,4,u,u,u,u,u,1,5,u,u,u,u,u,2,6,u,u,u,u,u,19,23,u,u,u,u,u,24,28,u,u]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u]
+; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm9, %ymm7, %ymm7
+; AVX2-FCP-NEXT: vpermd %ymm4, %ymm8, %ymm9
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,ymm9[1,5],zero,zero,zero,zero,zero,ymm9[2,6],zero,zero,zero,zero,zero,ymm9[19,23],zero,zero,zero,zero,zero,ymm9[24,28],zero,zero,zero,zero
+; AVX2-FCP-NEXT: vpermd %ymm3, %ymm8, %ymm8
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,ymm8[1,5],zero,zero,zero,zero,zero,ymm8[2,6],zero,zero,zero,zero,zero,ymm8[19,23],zero,zero,zero,zero,zero,ymm8[24,28],zero,zero,zero,zero,zero,ymm8[25]
+; AVX2-FCP-NEXT: vpor %ymm9, %ymm8, %ymm8
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255]
+; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm8, %ymm7, %ymm7
+; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,5,6]
+; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,2,3,3,2,2,3,3]
+; AVX2-FCP-NEXT: # ymm9 = mem[0,1,0,1]
+; AVX2-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm8
+; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,3,1,3]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255]
+; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm6
+; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm3[3,1,1,3]
; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[1],zero,zero,zero,zero,zero,ymm8[10,2],zero,zero,zero,zero,zero,ymm8[11,3],zero,zero,zero,zero,zero,ymm8[20,28],zero,zero,zero,zero,zero,ymm8[21,29],zero,zero,zero
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,3,3,1]
+; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm4[1,3,3,1]
; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,ymm9[1,9],zero,zero,zero,zero,zero,ymm9[2,10],zero,zero,zero,zero,zero,ymm9[3,19],zero,zero,zero,zero,zero,ymm9[28,20],zero,zero,zero,zero,zero,ymm9[29,21],zero
; AVX2-FCP-NEXT: vpor %ymm8, %ymm9, %ymm8
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0]
-; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm8, %ymm10, %ymm8
-; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,xmm5[12,13],zero,zero,zero,zero,zero,xmm5[14,15],zero,zero,zero
-; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[12,13],zero,zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero,zero
-; AVX2-FCP-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
+; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm8, %ymm6, %ymm6
+; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,xmm4[6,14],zero,zero,zero,zero,zero,xmm4[7,15],zero,zero,zero
+; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[6,14],zero,zero,zero,zero,zero,xmm3[7,15],zero,zero,zero,zero,zero
+; AVX2-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3
+; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,zero,zero,zero,zero,xmm1[13,12],zero,zero,zero,zero,zero,xmm1[15,14],zero
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[13,14,15,4,5],zero,zero,xmm2[14,15,14,15,12],zero,zero,xmm2[15]
-; AVX2-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0]
-; AVX2-FCP-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[13,14,15,4,5],zero,zero,xmm0[14,15,14,15,12],zero,zero,xmm0[15]
+; AVX2-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX2-FCP-NEXT: vmovdqa %ymm6, 64(%rax)
+; AVX2-FCP-NEXT: vmovdqa %ymm7, 32(%rax)
+; AVX2-FCP-NEXT: vmovdqa %ymm5, (%rax)
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0]
+; AVX2-FCP-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX2-FCP-NEXT: vmovdqa %xmm0, 96(%rax)
-; AVX2-FCP-NEXT: vmovdqa %ymm8, 64(%rax)
-; AVX2-FCP-NEXT: vmovdqa %ymm11, 32(%rax)
-; AVX2-FCP-NEXT: vmovdqa %ymm7, (%rax)
; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
;
@@ -2049,76 +2043,74 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512: # %bb.0:
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vmovdqa (%rsi), %xmm1
-; AVX512-NEXT: vmovdqa (%rdx), %xmm5
-; AVX512-NEXT: vmovdqa (%rcx), %xmm6
-; AVX512-NEXT: vmovdqa (%r8), %xmm3
-; AVX512-NEXT: vmovdqa (%r9), %xmm4
-; AVX512-NEXT: vmovdqa (%r10), %xmm2
-; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8
-; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm9
-; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm7
-; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = ymm9[u,u,u,u,u,5],zero,ymm9[u,u,u,u,u,6],zero,ymm9[u,u,u,u,u],zero,ymm9[23,u,u,u,u,u],zero,ymm9[24,u,u,u,u]
-; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm9[2,3,0,1]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u],zero,ymm11[5,u,u,u,u,u],zero,ymm11[6,u,u,u,u,u,23],zero,ymm11[u,u,u,u,u,24],zero,ymm11[u,u,u,u]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255]
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ~ymm12 & (ymm11 | ymm10)
-; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm9[0,2,0,2]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,ymm10[0,8,u,u,u],zero,zero,ymm10[1,9,u,u,u],zero,zero,ymm10[18,26,u,u,u],zero,zero,ymm10[19,27,u,u,u],zero,zero,ymm10[20,28]
-; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10
-; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm8[u,u,u,5],zero,ymm8[u,u,u,u,u,6],zero,ymm8[u,u,u,u,u],zero,ymm8[23,u,u,u,u,u],zero,ymm8[24,u,u,u,u,u],zero
-; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm8[2,3,0,1]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u],zero,ymm13[5,u,u,u,u,u],zero,ymm13[6,u,u,u,u,u,23],zero,ymm13[u,u,u,u,u,24],zero,ymm13[u,u,u,u,u,25]
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm13 = ymm12 & (ymm13 | ymm11)
-; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm8[0,2,0,2]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,8],zero,zero,ymm11[u,u,u,1,9],zero,zero,ymm11[u,u,u,2,10],zero,zero,ymm11[u,u,u,19,27],zero,zero,ymm11[u,u,u,20,28],zero,zero
-; AVX512-NEXT: vinserti64x4 $1, %ymm13, %zmm11, %zmm11
-; AVX512-NEXT: vporq %zmm10, %zmm11, %zmm10
-; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm7[4],zero,ymm7[u,u,u,u,u,5],zero,ymm7[u,u,u,u,u,6],zero,ymm7[u,u,u,u,u],zero,ymm7[23,u,u,u,u,u],zero,ymm7[24,u,u]
-; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm7[2,3,0,1]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm12 = zero,ymm12[4,u,u,u,u,u],zero,ymm12[5,u,u,u,u,u],zero,ymm12[6,u,u,u,u,u,23],zero,ymm12[u,u,u,u,u,24],zero,ymm12[u,u]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255]
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm13 & (ymm12 | ymm11)
-; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm7[0,2,0,2]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,0,8],zero,ymm11[u,u,u,u,1,9],zero,ymm11[u,u,u,u,18,26],zero,ymm11[u,u,u,u,19,27],zero,ymm11[u,u,u,u]
-; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11
-; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
-; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,0]
-; AVX512-NEXT: vpandn %ymm12, %ymm13, %ymm12
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm2[1,1,0,0,4,5,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,0]
-; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,1,0]
-; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm13, %ymm13
-; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12
-; AVX512-NEXT: vporq %zmm12, %zmm11, %zmm11
-; AVX512-NEXT: vpternlogd {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm10))
-; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[3,1,1,3]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[1],zero,zero,ymm8[u,u,u,10,2],zero,zero,ymm8[u,u,u,11,3],zero,zero,ymm8[u,u,u,20,28],zero,zero,ymm8[u,u,u,21,29],zero,zero,ymm8[u]
-; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,3,3,1]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = zero,ymm9[1,9,u,u,u],zero,zero,ymm9[2,10,u,u,u],zero,zero,ymm9[3,19,u,u,u],zero,zero,ymm9[28,20,u,u,u],zero,zero,ymm9[29,21,u]
-; AVX512-NEXT: vpor %ymm8, %ymm9, %ymm8
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm9 = xmm2[0,1,2,3,4,5,5,6]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3]
-; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1]
-; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,1,3]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,1,9],zero,ymm7[u,u,u,u,2,10],zero,ymm7[u,u,u,u,19,27],zero,ymm7[u,u,u,u,20,28],zero,ymm7[u,u,u,u,21]
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm9 & ~mem)
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (mem & (ymm7 ^ ymm8))
-; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,xmm5[12,13,u,u,u],zero,zero,xmm5[14,15,u,u,u]
-; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13],zero,zero,xmm0[u,u,u,14,15],zero,zero,xmm0[u,u,u]
-; AVX512-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
+; AVX512-NEXT: vmovdqa (%rdi), %xmm3
+; AVX512-NEXT: vmovdqa (%rdx), %xmm4
+; AVX512-NEXT: vmovdqa (%r8), %xmm1
+; AVX512-NEXT: vmovdqa (%r9), %xmm2
+; AVX512-NEXT: vmovdqa (%r10), %xmm0
+; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm3, %ymm3
+; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm4, %ymm4
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm5
+; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = ymm4[u,u,u,u,u,5],zero,ymm4[u,u,u,u,u,6],zero,ymm4[u,u,u,u,u],zero,ymm4[23,u,u,u,u,u],zero,ymm4[24,u,u,u,u]
+; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u],zero,ymm7[5,u,u,u,u,u],zero,ymm7[6,u,u,u,u,u,23],zero,ymm7[u,u,u,u,u,24],zero,ymm7[u,u,u,u]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255]
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ~ymm8 & (ymm7 | ymm6)
+; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm4[0,2,0,2]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm6[0,8,u,u,u],zero,zero,ymm6[1,9,u,u,u],zero,zero,ymm6[18,26,u,u,u],zero,zero,ymm6[19,27,u,u,u],zero,zero,ymm6[20,28]
+; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6
+; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,5],zero,ymm3[u,u,u,u,u,6],zero,ymm3[u,u,u,u,u],zero,ymm3[23,u,u,u,u,u],zero,ymm3[24,u,u,u,u,u],zero
+; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm3[2,3,0,1]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u],zero,ymm9[5,u,u,u,u,u],zero,ymm9[6,u,u,u,u,u,23],zero,ymm9[u,u,u,u,u,24],zero,ymm9[u,u,u,u,u,25]
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm8 & (ymm9 | ymm7)
+; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm3[0,2,0,2]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,8],zero,zero,ymm7[u,u,u,1,9],zero,zero,ymm7[u,u,u,2,10],zero,zero,ymm7[u,u,u,19,27],zero,zero,ymm7[u,u,u,20,28],zero,zero
+; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7
+; AVX512-NEXT: vporq %zmm6, %zmm7, %zmm6
+; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[4],zero,ymm5[u,u,u,u,u,5],zero,ymm5[u,u,u,u,u,6],zero,ymm5[u,u,u,u,u],zero,ymm5[23,u,u,u,u,u],zero,ymm5[24,u,u]
+; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm5[2,3,0,1]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[4,u,u,u,u,u],zero,ymm8[5,u,u,u,u,u],zero,ymm8[6,u,u,u,u,u,23],zero,ymm8[u,u,u,u,u,24],zero,ymm8[u,u]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255]
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm9 & (ymm8 | ymm7)
+; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm5[0,2,0,2]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,0,8],zero,ymm7[u,u,u,u,1,9],zero,ymm7[u,u,u,u,18,26],zero,ymm7[u,u,u,u,19,27],zero,ymm7[u,u,u,u]
+; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7
+; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
+; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,0]
+; AVX512-NEXT: vpandn %ymm8, %ymm9, %ymm8
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[1,1,0,0,4,5,6,7]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,0]
+; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,0]
+; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm9
+; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8
+; AVX512-NEXT: vporq %zmm8, %zmm7, %zmm7
+; AVX512-NEXT: vpternlogd {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm6))
+; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm3[3,1,1,3]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[1],zero,zero,ymm6[u,u,u,10,2],zero,zero,ymm6[u,u,u,11,3],zero,zero,ymm6[u,u,u,20,28],zero,zero,ymm6[u,u,u,21,29],zero,zero,ymm6[u]
+; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm4[1,3,3,1]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[1,9,u,u,u],zero,zero,ymm8[2,10,u,u,u],zero,zero,ymm8[3,19,u,u,u],zero,zero,ymm8[28,20,u,u,u],zero,zero,ymm8[29,21,u]
+; AVX512-NEXT: vpor %ymm6, %ymm8, %ymm6
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,5,6]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3]
+; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1]
+; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,3,1,3]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,1,9],zero,ymm5[u,u,u,u,2,10],zero,ymm5[u,u,u,u,19,27],zero,ymm5[u,u,u,u,20,28],zero,ymm5[u,u,u,u,21]
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm8 & ~mem)
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm6))
+; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[6,14,u,u,u],zero,zero,xmm4[7,15,u,u,u]
+; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,6,14],zero,zero,xmm3[u,u,u,7,15],zero,zero,xmm3[u,u,u]
+; AVX512-NEXT: vpor %xmm4, %xmm3, %xmm3
+; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,xmm1[u,u,u,u,13,12],zero,xmm1[u,u,u,u,15,14],zero
-; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[13,u,u,u,u],zero,zero,xmm2[14,u,u,u,u],zero,zero,xmm2[15]
-; AVX512-NEXT: vpor %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vpternlogq {{.*#+}} xmm1 = xmm1 ^ (mem & (xmm1 ^ xmm0))
-; AVX512-NEXT: vinserti32x4 $2, %xmm1, %zmm7, %zmm0
-; AVX512-NEXT: vmovdqa %xmm1, 96(%rax)
-; AVX512-NEXT: vmovdqa %ymm0, 64(%rax)
-; AVX512-NEXT: vmovdqa64 %zmm11, (%rax)
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[13,u,u,u,u],zero,zero,xmm0[14,u,u,u,u],zero,zero,xmm0[15]
+; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm3))
+; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm5, %zmm1
+; AVX512-NEXT: vmovdqa %xmm0, 96(%rax)
+; AVX512-NEXT: vmovdqa %ymm1, 64(%rax)
+; AVX512-NEXT: vmovdqa64 %zmm7, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
@@ -2126,69 +2118,67 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm1
-; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm5
-; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm6
-; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm3
-; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm4
-; AVX512-FCP-NEXT: vmovdqa (%r10), %xmm2
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm7
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm8
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm9
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm8[0,2,0,2]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,ymm10[0,8,u,u,u],zero,zero,ymm10[1,9,u,u,u],zero,zero,ymm10[18,26,u,u,u],zero,zero,ymm10[19,27,u,u,u],zero,zero,ymm10[20,28]
-; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [1,5,2,6,1,5,2,6]
-; AVX512-FCP-NEXT: # ymm11 = mem[0,1,0,1]
-; AVX512-FCP-NEXT: vpermd %ymm8, %ymm11, %ymm12
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u],zero,zero,ymm12[1,5,u,u,u],zero,zero,ymm12[2,6,u,u,u],zero,zero,ymm12[19,23,u,u,u],zero,zero,ymm12[24,28,u,u,u],zero
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm10, %zmm10
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm7[0,2,0,2]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,8],zero,zero,ymm12[u,u,u,1,9],zero,zero,ymm12[u,u,u,2,10],zero,zero,ymm12[u,u,u,19,27],zero,zero,ymm12[u,u,u,20,28],zero,zero
-; AVX512-FCP-NEXT: vpermd %ymm7, %ymm11, %ymm13
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,1,5],zero,zero,ymm13[u,u,u,2,6],zero,zero,ymm13[u,u,u,19,23],zero,zero,ymm13[u,u,u,24,28],zero,zero,ymm13[u,u,u,25]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12
-; AVX512-FCP-NEXT: vporq %zmm10, %zmm12, %zmm10
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[1,1,0,0,4,5,6,7]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,1,0,1,0,0,0,0]
-; AVX512-FCP-NEXT: vpermd %ymm12, %ymm13, %ymm12
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,1,0]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm9[0,2,0,2]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,0,8],zero,ymm13[u,u,u,u,1,9],zero,ymm13[u,u,u,u,18,26],zero,ymm13[u,u,u,u,19,27],zero,ymm13[u,u,u,u]
-; AVX512-FCP-NEXT: vpermd %ymm9, %ymm11, %ymm11
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,4],zero,ymm11[u,u,u,u,1,5],zero,ymm11[u,u,u,u,2,6],zero,ymm11[u,u,u,u,19,23],zero,ymm11[u,u,u,u,24,28],zero,ymm11[u]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm11
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 | (zmm12 & mem)
-; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm10))
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[3,1,1,3]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[1],zero,zero,ymm7[u,u,u,10,2],zero,zero,ymm7[u,u,u,11,3],zero,zero,ymm7[u,u,u,20,28],zero,zero,ymm7[u,u,u,21,29],zero,zero,ymm7[u]
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,3,3,1]
+; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm3
+; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm4
+; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm1
+; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm2
+; AVX512-FCP-NEXT: vmovdqa (%r10), %xmm0
+; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm3, %ymm3
+; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm4, %ymm4
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm5
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm4[0,2,0,2]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm6[0,8,u,u,u],zero,zero,ymm6[1,9,u,u,u],zero,zero,ymm6[18,26,u,u,u],zero,zero,ymm6[19,27,u,u,u],zero,zero,ymm6[20,28]
+; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,5,2,6,1,5,2,6]
+; AVX512-FCP-NEXT: # ymm7 = mem[0,1,0,1]
+; AVX512-FCP-NEXT: vpermd %ymm4, %ymm7, %ymm8
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u],zero,zero,ymm8[1,5,u,u,u],zero,zero,ymm8[2,6,u,u,u],zero,zero,ymm8[19,23,u,u,u],zero,zero,ymm8[24,28,u,u,u],zero
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm3[0,2,0,2]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,8],zero,zero,ymm8[u,u,u,1,9],zero,zero,ymm8[u,u,u,2,10],zero,zero,ymm8[u,u,u,19,27],zero,zero,ymm8[u,u,u,20,28],zero,zero
+; AVX512-FCP-NEXT: vpermd %ymm3, %ymm7, %ymm9
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,1,5],zero,zero,ymm9[u,u,u,2,6],zero,zero,ymm9[u,u,u,19,23],zero,zero,ymm9[u,u,u,24,28],zero,zero,ymm9[u,u,u,25]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8
+; AVX512-FCP-NEXT: vporq %zmm6, %zmm8, %zmm6
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[1,1,0,0,4,5,6,7]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,1,0,1,0,0,0,0]
+; AVX512-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm8
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,0]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm5[0,2,0,2]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,0,8],zero,ymm9[u,u,u,u,1,9],zero,ymm9[u,u,u,u,18,26],zero,ymm9[u,u,u,u,19,27],zero,ymm9[u,u,u,u]
+; AVX512-FCP-NEXT: vpermd %ymm5, %ymm7, %ymm7
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,4],zero,ymm7[u,u,u,u,1,5],zero,ymm7[u,u,u,u,2,6],zero,ymm7[u,u,u,u,19,23],zero,ymm7[u,u,u,u,24,28],zero,ymm7[u]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm9, %zmm7
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 | (zmm8 & mem)
+; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm6))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm3[3,1,1,3]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[1],zero,zero,ymm6[u,u,u,10,2],zero,zero,ymm6[u,u,u,11,3],zero,zero,ymm6[u,u,u,20,28],zero,zero,ymm6[u,u,u,21,29],zero,zero,ymm6[u]
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm4[1,3,3,1]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[1,9,u,u,u],zero,zero,ymm8[2,10,u,u,u],zero,zero,ymm8[3,19,u,u,u],zero,zero,ymm8[28,20,u,u,u],zero,zero,ymm8[29,21,u]
-; AVX512-FCP-NEXT: vpor %ymm7, %ymm8, %ymm7
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm2[0,1,2,3,4,5,5,6]
-; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,2,3,3,2,2,3,3]
-; AVX512-FCP-NEXT: # ymm10 = mem[0,1,0,1]
-; AVX512-FCP-NEXT: vpermd %ymm8, %ymm10, %ymm8
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,3,1,3]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,1,9],zero,ymm9[u,u,u,u,2,10],zero,ymm9[u,u,u,u,19,27],zero,ymm9[u,u,u,u,20,28],zero,ymm9[u,u,u,u,21]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm8 & ~mem)
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 ^ (mem & (ymm9 ^ ymm7))
-; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,xmm5[12,13,u,u,u],zero,zero,xmm5[14,15,u,u,u]
-; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13],zero,zero,xmm0[u,u,u,14,15],zero,zero,xmm0[u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
+; AVX512-FCP-NEXT: vpor %ymm6, %ymm8, %ymm6
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,5,6]
+; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,2,3,3,2,2,3,3]
+; AVX512-FCP-NEXT: # ymm9 = mem[0,1,0,1]
+; AVX512-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm8
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,3,1,3]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,1,9],zero,ymm5[u,u,u,u,2,10],zero,ymm5[u,u,u,u,19,27],zero,ymm5[u,u,u,u,20,28],zero,ymm5[u,u,u,u,21]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm8 & ~mem)
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm6))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[6,14,u,u,u],zero,zero,xmm4[7,15,u,u,u]
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,6,14],zero,zero,xmm3[u,u,u,7,15],zero,zero,xmm3[u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3
+; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,xmm1[u,u,u,u,13,12],zero,xmm1[u,u,u,u,15,14],zero
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[13,u,u,u,u],zero,zero,xmm2[14,u,u,u,u],zero,zero,xmm2[15]
-; AVX512-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} xmm1 = xmm1 ^ (mem & (xmm1 ^ xmm0))
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm9, %zmm0
-; AVX512-FCP-NEXT: vmovdqa %xmm1, 96(%rax)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm11, (%rax)
-; AVX512-FCP-NEXT: vmovdqa %ymm0, 64(%rax)
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[13,u,u,u,u],zero,zero,xmm0[14,u,u,u,u],zero,zero,xmm0[15]
+; AVX512-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm3))
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm5, %zmm1
+; AVX512-FCP-NEXT: vmovdqa %xmm0, 96(%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm7, (%rax)
+; AVX512-FCP-NEXT: vmovdqa %ymm1, 64(%rax)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
@@ -2196,76 +2186,74 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm1
-; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm5
-; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm6
-; AVX512DQ-NEXT: vmovdqa (%r8), %xmm3
-; AVX512DQ-NEXT: vmovdqa (%r9), %xmm4
-; AVX512DQ-NEXT: vmovdqa (%r10), %xmm2
-; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8
-; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm9
-; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm7
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = ymm9[u,u,u,u,u,5],zero,ymm9[u,u,u,u,u,6],zero,ymm9[u,u,u,u,u],zero,ymm9[23,u,u,u,u,u],zero,ymm9[24,u,u,u,u]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm9[2,3,0,1]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u],zero,ymm11[5,u,u,u,u,u],zero,ymm11[6,u,u,u,u,u,23],zero,ymm11[u,u,u,u,u,24],zero,ymm11[u,u,u,u]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ~ymm12 & (ymm11 | ymm10)
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm9[0,2,0,2]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,ymm10[0,8,u,u,u],zero,zero,ymm10[1,9,u,u,u],zero,zero,ymm10[18,26,u,u,u],zero,zero,ymm10[19,27,u,u,u],zero,zero,ymm10[20,28]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm8[u,u,u,5],zero,ymm8[u,u,u,u,u,6],zero,ymm8[u,u,u,u,u],zero,ymm8[23,u,u,u,u,u],zero,ymm8[24,u,u,u,u,u],zero
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm8[2,3,0,1]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u],zero,ymm13[5,u,u,u,u,u],zero,ymm13[6,u,u,u,u,u,23],zero,ymm13[u,u,u,u,u,24],zero,ymm13[u,u,u,u,u,25]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm13 = ymm12 & (ymm13 | ymm11)
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm8[0,2,0,2]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,8],zero,zero,ymm11[u,u,u,1,9],zero,zero,ymm11[u,u,u,2,10],zero,zero,ymm11[u,u,u,19,27],zero,zero,ymm11[u,u,u,20,28],zero,zero
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm13, %zmm11, %zmm11
-; AVX512DQ-NEXT: vporq %zmm10, %zmm11, %zmm10
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm7[4],zero,ymm7[u,u,u,u,u,5],zero,ymm7[u,u,u,u,u,6],zero,ymm7[u,u,u,u,u],zero,ymm7[23,u,u,u,u,u],zero,ymm7[24,u,u]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm7[2,3,0,1]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm12 = zero,ymm12[4,u,u,u,u,u],zero,ymm12[5,u,u,u,u,u],zero,ymm12[6,u,u,u,u,u,23],zero,ymm12[u,u,u,u,u,24],zero,ymm12[u,u]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm13 & (ymm12 | ymm11)
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm7[0,2,0,2]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,0,8],zero,ymm11[u,u,u,u,1,9],zero,ymm11[u,u,u,u,18,26],zero,ymm11[u,u,u,u,19,27],zero,ymm11[u,u,u,u]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,0]
-; AVX512DQ-NEXT: vpandn %ymm12, %ymm13, %ymm12
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm2[1,1,0,0,4,5,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,0]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,1,0]
-; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm13, %ymm13
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12
-; AVX512DQ-NEXT: vporq %zmm12, %zmm11, %zmm11
-; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm10))
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[3,1,1,3]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[1],zero,zero,ymm8[u,u,u,10,2],zero,zero,ymm8[u,u,u,11,3],zero,zero,ymm8[u,u,u,20,28],zero,zero,ymm8[u,u,u,21,29],zero,zero,ymm8[u]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,3,3,1]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = zero,ymm9[1,9,u,u,u],zero,zero,ymm9[2,10,u,u,u],zero,zero,ymm9[3,19,u,u,u],zero,zero,ymm9[28,20,u,u,u],zero,zero,ymm9[29,21,u]
-; AVX512DQ-NEXT: vpor %ymm8, %ymm9, %ymm8
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm9 = xmm2[0,1,2,3,4,5,5,6]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,1,3]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,1,9],zero,ymm7[u,u,u,u,2,10],zero,ymm7[u,u,u,u,19,27],zero,ymm7[u,u,u,u,20,28],zero,ymm7[u,u,u,u,21]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm9 & ~mem)
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (mem & (ymm7 ^ ymm8))
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,xmm5[12,13,u,u,u],zero,zero,xmm5[14,15,u,u,u]
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13],zero,zero,xmm0[u,u,u,14,15],zero,zero,xmm0[u,u,u]
-; AVX512DQ-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
+; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm3
+; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm4
+; AVX512DQ-NEXT: vmovdqa (%r8), %xmm1
+; AVX512DQ-NEXT: vmovdqa (%r9), %xmm2
+; AVX512DQ-NEXT: vmovdqa (%r10), %xmm0
+; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm3, %ymm3
+; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm4, %ymm4
+; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm5
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = ymm4[u,u,u,u,u,5],zero,ymm4[u,u,u,u,u,6],zero,ymm4[u,u,u,u,u],zero,ymm4[23,u,u,u,u,u],zero,ymm4[24,u,u,u,u]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u],zero,ymm7[5,u,u,u,u,u],zero,ymm7[6,u,u,u,u,u,23],zero,ymm7[u,u,u,u,u,24],zero,ymm7[u,u,u,u]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = ~ymm8 & (ymm7 | ymm6)
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm4[0,2,0,2]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm6[0,8,u,u,u],zero,zero,ymm6[1,9,u,u,u],zero,zero,ymm6[18,26,u,u,u],zero,zero,ymm6[19,27,u,u,u],zero,zero,ymm6[20,28]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,5],zero,ymm3[u,u,u,u,u,6],zero,ymm3[u,u,u,u,u],zero,ymm3[23,u,u,u,u,u],zero,ymm3[24,u,u,u,u,u],zero
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm3[2,3,0,1]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u],zero,ymm9[5,u,u,u,u,u],zero,ymm9[6,u,u,u,u,u,23],zero,ymm9[u,u,u,u,u,24],zero,ymm9[u,u,u,u,u,25]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm8 & (ymm9 | ymm7)
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm3[0,2,0,2]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,8],zero,zero,ymm7[u,u,u,1,9],zero,zero,ymm7[u,u,u,2,10],zero,zero,ymm7[u,u,u,19,27],zero,zero,ymm7[u,u,u,20,28],zero,zero
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7
+; AVX512DQ-NEXT: vporq %zmm6, %zmm7, %zmm6
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[4],zero,ymm5[u,u,u,u,u,5],zero,ymm5[u,u,u,u,u,6],zero,ymm5[u,u,u,u,u],zero,ymm5[23,u,u,u,u,u],zero,ymm5[24,u,u]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm5[2,3,0,1]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[4,u,u,u,u,u],zero,ymm8[5,u,u,u,u,u],zero,ymm8[6,u,u,u,u,u,23],zero,ymm8[u,u,u,u,u,24],zero,ymm8[u,u]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm9 & (ymm8 | ymm7)
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm5[0,2,0,2]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,0,8],zero,ymm7[u,u,u,u,1,9],zero,ymm7[u,u,u,u,18,26],zero,ymm7[u,u,u,u,19,27],zero,ymm7[u,u,u,u]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,0]
+; AVX512DQ-NEXT: vpandn %ymm8, %ymm9, %ymm8
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[1,1,0,0,4,5,6,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,0]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,0]
+; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm9
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8
+; AVX512DQ-NEXT: vporq %zmm8, %zmm7, %zmm7
+; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm6))
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm3[3,1,1,3]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[1],zero,zero,ymm6[u,u,u,10,2],zero,zero,ymm6[u,u,u,11,3],zero,zero,ymm6[u,u,u,20,28],zero,zero,ymm6[u,u,u,21,29],zero,zero,ymm6[u]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm4[1,3,3,1]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[1,9,u,u,u],zero,zero,ymm8[2,10,u,u,u],zero,zero,ymm8[3,19,u,u,u],zero,zero,ymm8[28,20,u,u,u],zero,zero,ymm8[29,21,u]
+; AVX512DQ-NEXT: vpor %ymm6, %ymm8, %ymm6
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,5,6]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,3,1,3]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,1,9],zero,ymm5[u,u,u,u,2,10],zero,ymm5[u,u,u,u,19,27],zero,ymm5[u,u,u,u,20,28],zero,ymm5[u,u,u,u,21]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm8 & ~mem)
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm6))
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[6,14,u,u,u],zero,zero,xmm4[7,15,u,u,u]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,6,14],zero,zero,xmm3[u,u,u,7,15],zero,zero,xmm3[u,u,u]
+; AVX512DQ-NEXT: vpor %xmm4, %xmm3, %xmm3
+; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,xmm1[u,u,u,u,13,12],zero,xmm1[u,u,u,u,15,14],zero
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[13,u,u,u,u],zero,zero,xmm2[14,u,u,u,u],zero,zero,xmm2[15]
-; AVX512DQ-NEXT: vpor %xmm2, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} xmm1 = xmm1 ^ (mem & (xmm1 ^ xmm0))
-; AVX512DQ-NEXT: vinserti32x4 $2, %xmm1, %zmm7, %zmm0
-; AVX512DQ-NEXT: vmovdqa %xmm1, 96(%rax)
-; AVX512DQ-NEXT: vmovdqa %ymm0, 64(%rax)
-; AVX512DQ-NEXT: vmovdqa64 %zmm11, (%rax)
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[13,u,u,u,u],zero,zero,xmm0[14,u,u,u,u],zero,zero,xmm0[15]
+; AVX512DQ-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm3))
+; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm5, %zmm1
+; AVX512DQ-NEXT: vmovdqa %xmm0, 96(%rax)
+; AVX512DQ-NEXT: vmovdqa %ymm1, 64(%rax)
+; AVX512DQ-NEXT: vmovdqa64 %zmm7, (%rax)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
@@ -2273,69 +2261,67 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm5
-; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm6
-; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm3
-; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm4
-; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %xmm2
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm7
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm8
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm9
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm8[0,2,0,2]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,ymm10[0,8,u,u,u],zero,zero,ymm10[1,9,u,u,u],zero,zero,ymm10[18,26,u,u,u],zero,zero,ymm10[19,27,u,u,u],zero,zero,ymm10[20,28]
-; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [1,5,2,6,1,5,2,6]
-; AVX512DQ-FCP-NEXT: # ymm11 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm11, %ymm12
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u],zero,zero,ymm12[1,5,u,u,u],zero,zero,ymm12[2,6,u,u,u],zero,zero,ymm12[19,23,u,u,u],zero,zero,ymm12[24,28,u,u,u],zero
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm10, %zmm10
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm7[0,2,0,2]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,8],zero,zero,ymm12[u,u,u,1,9],zero,zero,ymm12[u,u,u,2,10],zero,zero,ymm12[u,u,u,19,27],zero,zero,ymm12[u,u,u,20,28],zero,zero
-; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm11, %ymm13
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,1,5],zero,zero,ymm13[u,u,u,2,6],zero,zero,ymm13[u,u,u,19,23],zero,zero,ymm13[u,u,u,24,28],zero,zero,ymm13[u,u,u,25]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12
-; AVX512DQ-FCP-NEXT: vporq %zmm10, %zmm12, %zmm10
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[1,1,0,0,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,1,0,1,0,0,0,0]
-; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm13, %ymm12
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,1,0]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm9[0,2,0,2]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,0,8],zero,ymm13[u,u,u,u,1,9],zero,ymm13[u,u,u,u,18,26],zero,ymm13[u,u,u,u,19,27],zero,ymm13[u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpermd %ymm9, %ymm11, %ymm11
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,4],zero,ymm11[u,u,u,u,1,5],zero,ymm11[u,u,u,u,2,6],zero,ymm11[u,u,u,u,19,23],zero,ymm11[u,u,u,u,24,28],zero,ymm11[u]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm11
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 | (zmm12 & mem)
-; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm10))
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[3,1,1,3]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[1],zero,zero,ymm7[u,u,u,10,2],zero,zero,ymm7[u,u,u,11,3],zero,zero,ymm7[u,u,u,20,28],zero,zero,ymm7[u,u,u,21,29],zero,zero,ymm7[u]
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,3,3,1]
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm3
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm4
+; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm1
+; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm2
+; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %xmm0
+; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm3, %ymm3
+; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm4, %ymm4
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm5
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm4[0,2,0,2]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm6[0,8,u,u,u],zero,zero,ymm6[1,9,u,u,u],zero,zero,ymm6[18,26,u,u,u],zero,zero,ymm6[19,27,u,u,u],zero,zero,ymm6[20,28]
+; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,5,2,6,1,5,2,6]
+; AVX512DQ-FCP-NEXT: # ymm7 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm7, %ymm8
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u],zero,zero,ymm8[1,5,u,u,u],zero,zero,ymm8[2,6,u,u,u],zero,zero,ymm8[19,23,u,u,u],zero,zero,ymm8[24,28,u,u,u],zero
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm3[0,2,0,2]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,8],zero,zero,ymm8[u,u,u,1,9],zero,zero,ymm8[u,u,u,2,10],zero,zero,ymm8[u,u,u,19,27],zero,zero,ymm8[u,u,u,20,28],zero,zero
+; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm7, %ymm9
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,1,5],zero,zero,ymm9[u,u,u,2,6],zero,zero,ymm9[u,u,u,19,23],zero,zero,ymm9[u,u,u,24,28],zero,zero,ymm9[u,u,u,25]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8
+; AVX512DQ-FCP-NEXT: vporq %zmm6, %zmm8, %zmm6
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[1,1,0,0,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,1,0,1,0,0,0,0]
+; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm8
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,0]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm5[0,2,0,2]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,0,8],zero,ymm9[u,u,u,u,1,9],zero,ymm9[u,u,u,u,18,26],zero,ymm9[u,u,u,u,19,27],zero,ymm9[u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm7, %ymm7
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,4],zero,ymm7[u,u,u,u,1,5],zero,ymm7[u,u,u,u,2,6],zero,ymm7[u,u,u,u,19,23],zero,ymm7[u,u,u,u,24,28],zero,ymm7[u]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm9, %zmm7
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 | (zmm8 & mem)
+; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm6))
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm3[3,1,1,3]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[1],zero,zero,ymm6[u,u,u,10,2],zero,zero,ymm6[u,u,u,11,3],zero,zero,ymm6[u,u,u,20,28],zero,zero,ymm6[u,u,u,21,29],zero,zero,ymm6[u]
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm4[1,3,3,1]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[1,9,u,u,u],zero,zero,ymm8[2,10,u,u,u],zero,zero,ymm8[3,19,u,u,u],zero,zero,ymm8[28,20,u,u,u],zero,zero,ymm8[29,21,u]
-; AVX512DQ-FCP-NEXT: vpor %ymm7, %ymm8, %ymm7
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm2[0,1,2,3,4,5,5,6]
-; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,2,3,3,2,2,3,3]
-; AVX512DQ-FCP-NEXT: # ymm10 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm10, %ymm8
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,3,1,3]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,1,9],zero,ymm9[u,u,u,u,2,10],zero,ymm9[u,u,u,u,19,27],zero,ymm9[u,u,u,u,20,28],zero,ymm9[u,u,u,u,21]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm8 & ~mem)
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 ^ (mem & (ymm9 ^ ymm7))
-; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,xmm5[12,13,u,u,u],zero,zero,xmm5[14,15,u,u,u]
-; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13],zero,zero,xmm0[u,u,u,14,15],zero,zero,xmm0[u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
+; AVX512DQ-FCP-NEXT: vpor %ymm6, %ymm8, %ymm6
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,5,6]
+; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,2,3,3,2,2,3,3]
+; AVX512DQ-FCP-NEXT: # ymm9 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm8
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,3,1,3]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,1,9],zero,ymm5[u,u,u,u,2,10],zero,ymm5[u,u,u,u,19,27],zero,ymm5[u,u,u,u,20,28],zero,ymm5[u,u,u,u,21]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm8 & ~mem)
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm6))
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[6,14,u,u,u],zero,zero,xmm4[7,15,u,u,u]
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,6,14],zero,zero,xmm3[u,u,u,7,15],zero,zero,xmm3[u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3
+; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,xmm1[u,u,u,u,13,12],zero,xmm1[u,u,u,u,15,14],zero
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[13,u,u,u,u],zero,zero,xmm2[14,u,u,u,u],zero,zero,xmm2[15]
-; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} xmm1 = xmm1 ^ (mem & (xmm1 ^ xmm0))
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm9, %zmm0
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, 96(%rax)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, (%rax)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, 64(%rax)
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[13,u,u,u,u],zero,zero,xmm0[14,u,u,u,u],zero,zero,xmm0[15]
+; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm3))
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm5, %zmm1
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, 96(%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, 64(%rax)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
@@ -2343,82 +2329,80 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm2
-; AVX512BW-NEXT: vmovdqa (%rsi), %xmm3
-; AVX512BW-NEXT: vmovdqa (%rdx), %xmm4
-; AVX512BW-NEXT: vmovdqa (%rcx), %xmm5
-; AVX512BW-NEXT: vmovdqa (%r8), %xmm6
-; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm0
-; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm1
-; AVX512BW-NEXT: vinserti128 $1, (%r9), %ymm6, %ymm6
-; AVX512BW-NEXT: vinserti32x4 $2, (%r10), %zmm6, %zmm6
-; AVX512BW-NEXT: vextracti128 $1, %ymm6, %xmm7
-; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u]
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm6, %ymm8
-; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm8[0,1,2,3,6,7,7,7]
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,3,2]
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm1
+; AVX512BW-NEXT: vmovdqa (%rdx), %xmm0
+; AVX512BW-NEXT: vmovdqa (%r8), %xmm2
+; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0
+; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1
+; AVX512BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2
+; AVX512BW-NEXT: vinserti32x4 $2, (%r10), %zmm2, %zmm2
+; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u]
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm2, %ymm4
+; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,6,7,7,7]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,3,2]
; AVX512BW-NEXT: movw $-32510, %cx # imm = 0x8102
; AVX512BW-NEXT: kmovd %ecx, %k1
-; AVX512BW-NEXT: vmovdqu8 %xmm9, %xmm7 {%k1}
-; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[12,13],zero,zero,zero,zero,zero,xmm2[14,15],zero,zero,zero,zero,zero
-; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm3[12,13],zero,zero,zero,zero,zero,xmm3[14,15],zero,zero,zero
-; AVX512BW-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX512BW-NEXT: vmovdqu8 %xmm5, %xmm3 {%k1}
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[1,3,2,3]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[6,14],zero,zero,zero,zero,zero,xmm5[7,15],zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[1,3,2,3]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[6,14],zero,zero,zero,zero,zero,xmm6[7,15],zero,zero,zero
+; AVX512BW-NEXT: vpor %xmm5, %xmm6, %xmm5
; AVX512BW-NEXT: movw $-7741, %cx # imm = 0xE1C3
; AVX512BW-NEXT: kmovd %ecx, %k1
-; AVX512BW-NEXT: vmovdqu8 %xmm7, %xmm2 {%k1}
+; AVX512BW-NEXT: vmovdqu8 %xmm3, %xmm5 {%k1}
; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6]
; AVX512BW-NEXT: # ymm3 = mem[0,1,0,1]
-; AVX512BW-NEXT: vpermw %ymm8, %ymm3, %ymm3
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm6[1,3,1,3]
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21]
+; AVX512BW-NEXT: vpermw %ymm4, %ymm3, %ymm3
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm6 = ymm2[1,3,1,3]
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21]
; AVX512BW-NEXT: movl $67637280, %ecx # imm = 0x4081020
; AVX512BW-NEXT: kmovd %ecx, %k1
-; AVX512BW-NEXT: vmovdqu8 %ymm3, %ymm4 {%k1}
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[1,3,3,1]
+; AVX512BW-NEXT: vmovdqu8 %ymm3, %ymm6 {%k1}
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[1,3,3,1]
; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = zero,ymm3[1,9],zero,zero,zero,zero,zero,ymm3[2,10],zero,zero,zero,zero,zero,ymm3[3,19],zero,zero,zero,zero,zero,ymm3[28,20],zero,zero,zero,zero,zero,ymm3[29,21],zero
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm0[3,1,1,3]
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[1],zero,zero,zero,zero,zero,ymm5[10,2],zero,zero,zero,zero,zero,ymm5[11,3],zero,zero,zero,zero,zero,ymm5[20,28],zero,zero,zero,zero,zero,ymm5[21,29],zero,zero,zero
-; AVX512BW-NEXT: vpor %ymm3, %ymm5, %ymm3
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm1[3,1,1,3]
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[1],zero,zero,zero,zero,zero,ymm7[10,2],zero,zero,zero,zero,zero,ymm7[11,3],zero,zero,zero,zero,zero,ymm7[20,28],zero,zero,zero,zero,zero,ymm7[21,29],zero,zero,zero
+; AVX512BW-NEXT: vpor %ymm3, %ymm7, %ymm3
; AVX512BW-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38
; AVX512BW-NEXT: kmovd %ecx, %k1
-; AVX512BW-NEXT: vmovdqu8 %ymm4, %ymm3 {%k1}
-; AVX512BW-NEXT: vinserti32x4 $2, %xmm2, %zmm3, %zmm3
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,50,50,50,52,50,50,50,52,51,51,51,51,50,50,50,52]
-; AVX512BW-NEXT: vpermi2w %zmm6, %zmm8, %zmm4
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[4],zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
+; AVX512BW-NEXT: vmovdqu8 %ymm6, %ymm3 {%k1}
+; AVX512BW-NEXT: vinserti32x4 $2, %xmm5, %zmm3, %zmm3
+; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,50,50,50,52,50,50,50,52,51,51,51,51,50,50,50,52]
+; AVX512BW-NEXT: vpermi2w %zmm2, %zmm4, %zmm6
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[4],zero,zero,zero,zero,zero,zero,ymm2[5],zero,zero,zero,zero,zero,zero,ymm2[6],zero,zero,zero,zero,zero,zero,zero,ymm2[23],zero,zero,zero,zero,zero,zero,ymm2[24],zero,zero
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1]
; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,ymm7[4],zero,zero,zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero
-; AVX512BW-NEXT: vpor %ymm5, %ymm7, %ymm5
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,0,2]
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u]
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5
+; AVX512BW-NEXT: vpor %ymm4, %ymm7, %ymm4
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2]
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u]
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2
; AVX512BW-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040
; AVX512BW-NEXT: kmovq %rcx, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm4, %zmm5 {%k1}
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,ymm1[5],zero,zero,zero,zero,zero,zero,ymm1[6],zero,zero,zero,zero,zero,zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,ymm1[24],zero,zero,zero,zero
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm6 = ymm1[2,3,0,1]
+; AVX512BW-NEXT: vmovdqu8 %zmm6, %zmm2 {%k1}
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,ymm0[5],zero,zero,zero,zero,zero,zero,ymm0[6],zero,zero,zero,zero,zero,zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,ymm0[24],zero,zero,zero,zero
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[2,3,0,1]
; AVX512BW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpor %ymm4, %ymm6, %ymm4
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm0[5],zero,zero,zero,zero,zero,zero,ymm0[6],zero,zero,zero,zero,zero,zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,ymm0[24],zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm0[2,3,0,1]
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm1[5],zero,zero,zero,zero,zero,zero,ymm1[6],zero,zero,zero,zero,zero,zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,ymm1[24],zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm1[2,3,0,1]
; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero,zero,zero,zero,ymm7[25]
; AVX512BW-NEXT: vpor %ymm7, %ymm6, %ymm6
; AVX512BW-NEXT: movl $202911840, %ecx # imm = 0xC183060
; AVX512BW-NEXT: kmovd %ecx, %k1
; AVX512BW-NEXT: vmovdqu8 %ymm4, %ymm6 {%k1}
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2]
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,ymm0[1,9],zero,zero,zero,zero,zero,ymm0[2,10],zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero,zero,ymm0[20,28],zero,zero
; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,0,2]
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,ymm1[0,8],zero,zero,zero,zero,zero,ymm1[1,9],zero,zero,zero,zero,zero,ymm1[18,26],zero,zero,zero,zero,zero,ymm1[19,27],zero,zero,zero,zero,zero,ymm1[20,28]
-; AVX512BW-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,8],zero,zero,zero,zero,zero,ymm1[1,9],zero,zero,zero,zero,zero,ymm1[2,10],zero,zero,zero,zero,zero,ymm1[19,27],zero,zero,zero,zero,zero,ymm1[20,28],zero,zero
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2]
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0,8],zero,zero,zero,zero,zero,ymm0[1,9],zero,zero,zero,zero,zero,ymm0[18,26],zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero,zero,ymm0[20,28]
+; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0
; AVX512BW-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870
; AVX512BW-NEXT: kmovq %rcx, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm5, %zmm0 {%k1}
-; AVX512BW-NEXT: vmovdqa %xmm2, 96(%rax)
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512BW-NEXT: vmovdqa %xmm5, 96(%rax)
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax)
; AVX512BW-NEXT: vmovdqa %ymm3, 64(%rax)
; AVX512BW-NEXT: vzeroupper
@@ -2428,72 +2412,70 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm1
-; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm2
-; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %xmm3
-; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm4
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6
-; AVX512BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm4, %ymm4
-; AVX512BW-FCP-NEXT: vinserti32x4 $2, (%r10), %zmm4, %zmm4
-; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[12,13],zero,zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero,zero
-; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[12,13],zero,zero,zero,zero,zero,xmm1[14,15],zero,zero,zero
-; AVX512BW-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm1
-; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,zero,zero,zero,zero,xmm1[13,12],zero,zero,zero,zero,zero,xmm1[15,14],zero
-; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm4, %ymm2
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm2[13],zero,zero,zero,zero,zero,zero,xmm2[14],zero,zero,zero,zero,zero,zero,xmm2[15]
-; AVX512BW-FCP-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm1
+; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm0
+; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2
+; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0
+; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1
+; AVX512BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2
+; AVX512BW-FCP-NEXT: vinserti32x4 $2, (%r10), %zmm2, %zmm2
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm1[1,3,2,3]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[6,14],zero,zero,zero,zero,zero,xmm3[7,15],zero,zero,zero,zero,zero
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[1,3,2,3]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,xmm4[6,14],zero,zero,zero,zero,zero,xmm4[7,15],zero,zero,zero
+; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[10],zero,zero,zero,zero,zero,xmm4[13,12],zero,zero,zero,zero,zero,xmm4[15,14],zero
+; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm2, %ymm5
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm5[13],zero,zero,zero,zero,zero,zero,xmm5[14],zero,zero,zero,zero,zero,zero,xmm5[15]
+; AVX512BW-FCP-NEXT: vpor %xmm4, %xmm6, %xmm4
; AVX512BW-FCP-NEXT: movw $-7741, %cx # imm = 0xE1C3
; AVX512BW-FCP-NEXT: kmovd %ecx, %k1
-; AVX512BW-FCP-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
-; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6]
-; AVX512BW-FCP-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512BW-FCP-NEXT: vpermw %ymm2, %ymm1, %ymm1
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm4[1,3,1,3]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21]
+; AVX512BW-FCP-NEXT: vmovdqu8 %xmm4, %xmm3 {%k1}
+; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6]
+; AVX512BW-FCP-NEXT: # ymm4 = mem[0,1,0,1]
+; AVX512BW-FCP-NEXT: vpermw %ymm5, %ymm4, %ymm4
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm2[1,3,1,3]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21]
; AVX512BW-FCP-NEXT: movl $67637280, %ecx # imm = 0x4081020
; AVX512BW-FCP-NEXT: kmovd %ecx, %k1
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm3 {%k1}
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm6[1,3,3,1]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,ymm1[1,9],zero,zero,zero,zero,zero,ymm1[2,10],zero,zero,zero,zero,zero,ymm1[3,19],zero,zero,zero,zero,zero,ymm1[28,20],zero,zero,zero,zero,zero,ymm1[29,21],zero
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm5[3,1,1,3]
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm4, %ymm6 {%k1}
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[1,3,3,1]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,ymm4[1,9],zero,zero,zero,zero,zero,ymm4[2,10],zero,zero,zero,zero,zero,ymm4[3,19],zero,zero,zero,zero,zero,ymm4[28,20],zero,zero,zero,zero,zero,ymm4[29,21],zero
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm1[3,1,1,3]
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[1],zero,zero,zero,zero,zero,ymm7[10,2],zero,zero,zero,zero,zero,ymm7[11,3],zero,zero,zero,zero,zero,ymm7[20,28],zero,zero,zero,zero,zero,ymm7[21,29],zero,zero,zero
-; AVX512BW-FCP-NEXT: vpor %ymm1, %ymm7, %ymm1
+; AVX512BW-FCP-NEXT: vpor %ymm4, %ymm7, %ymm4
; AVX512BW-FCP-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38
; AVX512BW-FCP-NEXT: kmovd %ecx, %k1
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm3, %ymm1 {%k1}
-; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm1
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,50,50,50,52,50,50,50,52,51,51,51,51,50,50,50,52]
-; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm2, %zmm3
-; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,5,2,6,1,5,2,6]
-; AVX512BW-FCP-NEXT: # ymm2 = mem[0,1,0,1]
-; AVX512BW-FCP-NEXT: vpermd %ymm4, %ymm2, %ymm7
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,0,2]
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u,32,36,u,u,u,u,u,33,37,u,u,u,u,u,34,38,u,u,u,u,u,51,55,u,u,u,u,u,56,60,u,u]
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm6, %ymm4 {%k1}
+; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm4, %zmm4
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,50,50,50,52,50,50,50,52,51,51,51,51,50,50,50,52]
+; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm5, %zmm6
+; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,5,2,6,1,5,2,6]
+; AVX512BW-FCP-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512BW-FCP-NEXT: vpermd %ymm2, %ymm5, %ymm7
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2]
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u,32,36,u,u,u,u,u,33,37,u,u,u,u,u,34,38,u,u,u,u,u,51,55,u,u,u,u,u,56,60,u,u]
; AVX512BW-FCP-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040
; AVX512BW-FCP-NEXT: kmovq %rcx, %k1
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm3, %zmm4 {%k1}
-; AVX512BW-FCP-NEXT: vpermd %ymm5, %ymm2, %ymm3
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,0,2]
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm3
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[0,8],zero,zero,zero,zero,zero,zmm3[1,9],zero,zero,zero,zero,zero,zmm3[2,10],zero,zero,zero,zero,zero,zmm3[19,27],zero,zero,zero,zero,zero,zmm3[20,28],zero,zero,zero,zero,zero,zmm3[33,37],zero,zero,zero,zero,zero,zmm3[34,38],zero,zero,zero,zero,zero,zmm3[51,55],zero,zero,zero,zero,zero,zmm3[56,60],zero,zero,zero,zero,zero,zmm3[57]
-; AVX512BW-FCP-NEXT: vpermd %ymm6, %ymm2, %ymm2
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm6[0,2,0,2]
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm2 = zero,zero,zmm2[0,8],zero,zero,zero,zero,zero,zmm2[1,9],zero,zero,zero,zero,zero,zmm2[18,26],zero,zero,zero,zero,zero,zmm2[19,27],zero,zero,zero,zero,zero,zmm2[20,28],zero,zero,zero,zero,zero,zmm2[33,37],zero,zero,zero,zero,zero,zmm2[34,38],zero,zero,zero,zero,zero,zmm2[51,55],zero,zero,zero,zero,zero,zmm2[56,60],zero,zero,zero,zero
-; AVX512BW-FCP-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm2 {%k1}
+; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm5, %ymm6
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,0,2]
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm1, %zmm1
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zmm1[2,10],zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zero,zero,zmm1[33,37],zero,zero,zero,zero,zero,zmm1[34,38],zero,zero,zero,zero,zero,zmm1[51,55],zero,zero,zero,zero,zero,zmm1[56,60],zero,zero,zero,zero,zero,zmm1[57]
+; AVX512BW-FCP-NEXT: vpermd %ymm0, %ymm5, %ymm5
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2]
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zmm0[0,8],zero,zero,zero,zero,zero,zmm0[1,9],zero,zero,zero,zero,zero,zmm0[18,26],zero,zero,zero,zero,zero,zmm0[19,27],zero,zero,zero,zero,zero,zmm0[20,28],zero,zero,zero,zero,zero,zmm0[33,37],zero,zero,zero,zero,zero,zmm0[34,38],zero,zero,zero,zero,zero,zmm0[51,55],zero,zero,zero,zero,zero,zmm0[56,60],zero,zero,zero,zero
+; AVX512BW-FCP-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-FCP-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870
; AVX512BW-FCP-NEXT: kmovq %rcx, %k1
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm2 {%k1}
-; AVX512BW-FCP-NEXT: vmovdqa %xmm0, 96(%rax)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax)
-; AVX512BW-FCP-NEXT: vmovdqa %ymm1, 64(%rax)
+; AVX512BW-FCP-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512BW-FCP-NEXT: vmovdqa %xmm3, 96(%rax)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
+; AVX512BW-FCP-NEXT: vmovdqa %ymm4, 64(%rax)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
@@ -2501,82 +2483,80 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm2
-; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm3
-; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm4
-; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %xmm5
-; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm6
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm0
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm1
-; AVX512DQ-BW-NEXT: vinserti128 $1, (%r9), %ymm6, %ymm6
-; AVX512DQ-BW-NEXT: vinserti32x4 $2, (%r10), %zmm6, %zmm6
-; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm6, %xmm7
-; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u]
-; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm6, %ymm8
-; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm8[0,1,2,3,6,7,7,7]
-; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,3,2]
+; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm1
+; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm0
+; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2
+; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0
+; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1
+; AVX512DQ-BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2
+; AVX512DQ-BW-NEXT: vinserti32x4 $2, (%r10), %zmm2, %zmm2
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u]
+; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm2, %ymm4
+; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,6,7,7,7]
+; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,3,2]
; AVX512DQ-BW-NEXT: movw $-32510, %cx # imm = 0x8102
; AVX512DQ-BW-NEXT: kmovd %ecx, %k1
-; AVX512DQ-BW-NEXT: vmovdqu8 %xmm9, %xmm7 {%k1}
-; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[12,13],zero,zero,zero,zero,zero,xmm2[14,15],zero,zero,zero,zero,zero
-; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm3[12,13],zero,zero,zero,zero,zero,xmm3[14,15],zero,zero,zero
-; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX512DQ-BW-NEXT: vmovdqu8 %xmm5, %xmm3 {%k1}
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[1,3,2,3]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[6,14],zero,zero,zero,zero,zero,xmm5[7,15],zero,zero,zero,zero,zero
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[1,3,2,3]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[6,14],zero,zero,zero,zero,zero,xmm6[7,15],zero,zero,zero
+; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm6, %xmm5
; AVX512DQ-BW-NEXT: movw $-7741, %cx # imm = 0xE1C3
; AVX512DQ-BW-NEXT: kmovd %ecx, %k1
-; AVX512DQ-BW-NEXT: vmovdqu8 %xmm7, %xmm2 {%k1}
+; AVX512DQ-BW-NEXT: vmovdqu8 %xmm3, %xmm5 {%k1}
; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6]
; AVX512DQ-BW-NEXT: # ymm3 = mem[0,1,0,1]
-; AVX512DQ-BW-NEXT: vpermw %ymm8, %ymm3, %ymm3
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm4 = ymm6[1,3,1,3]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21]
+; AVX512DQ-BW-NEXT: vpermw %ymm4, %ymm3, %ymm3
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm2[1,3,1,3]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21]
; AVX512DQ-BW-NEXT: movl $67637280, %ecx # imm = 0x4081020
; AVX512DQ-BW-NEXT: kmovd %ecx, %k1
-; AVX512DQ-BW-NEXT: vmovdqu8 %ymm3, %ymm4 {%k1}
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[1,3,3,1]
+; AVX512DQ-BW-NEXT: vmovdqu8 %ymm3, %ymm6 {%k1}
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[1,3,3,1]
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm3 = zero,ymm3[1,9],zero,zero,zero,zero,zero,ymm3[2,10],zero,zero,zero,zero,zero,ymm3[3,19],zero,zero,zero,zero,zero,ymm3[28,20],zero,zero,zero,zero,zero,ymm3[29,21],zero
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm0[3,1,1,3]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[1],zero,zero,zero,zero,zero,ymm5[10,2],zero,zero,zero,zero,zero,ymm5[11,3],zero,zero,zero,zero,zero,ymm5[20,28],zero,zero,zero,zero,zero,ymm5[21,29],zero,zero,zero
-; AVX512DQ-BW-NEXT: vpor %ymm3, %ymm5, %ymm3
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm1[3,1,1,3]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[1],zero,zero,zero,zero,zero,ymm7[10,2],zero,zero,zero,zero,zero,ymm7[11,3],zero,zero,zero,zero,zero,ymm7[20,28],zero,zero,zero,zero,zero,ymm7[21,29],zero,zero,zero
+; AVX512DQ-BW-NEXT: vpor %ymm3, %ymm7, %ymm3
; AVX512DQ-BW-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38
; AVX512DQ-BW-NEXT: kmovd %ecx, %k1
-; AVX512DQ-BW-NEXT: vmovdqu8 %ymm4, %ymm3 {%k1}
-; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm2, %zmm3, %zmm3
-; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,50,50,50,52,50,50,50,52,51,51,51,51,50,50,50,52]
-; AVX512DQ-BW-NEXT: vpermi2w %zmm6, %zmm8, %zmm4
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[4],zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
+; AVX512DQ-BW-NEXT: vmovdqu8 %ymm6, %ymm3 {%k1}
+; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm5, %zmm3, %zmm3
+; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,50,50,50,52,50,50,50,52,51,51,51,51,50,50,50,52]
+; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm4, %zmm6
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[4],zero,zero,zero,zero,zero,zero,ymm2[5],zero,zero,zero,zero,zero,zero,ymm2[6],zero,zero,zero,zero,zero,zero,zero,ymm2[23],zero,zero,zero,zero,zero,zero,ymm2[24],zero,zero
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1]
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,ymm7[4],zero,zero,zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero
-; AVX512DQ-BW-NEXT: vpor %ymm5, %ymm7, %ymm5
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,0,2]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5
+; AVX512DQ-BW-NEXT: vpor %ymm4, %ymm7, %ymm4
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2
; AVX512DQ-BW-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040
; AVX512DQ-BW-NEXT: kmovq %rcx, %k1
-; AVX512DQ-BW-NEXT: vmovdqu8 %zmm4, %zmm5 {%k1}
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,ymm1[5],zero,zero,zero,zero,zero,zero,ymm1[6],zero,zero,zero,zero,zero,zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,ymm1[24],zero,zero,zero,zero
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm1[2,3,0,1]
+; AVX512DQ-BW-NEXT: vmovdqu8 %zmm6, %zmm2 {%k1}
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,ymm0[5],zero,zero,zero,zero,zero,zero,ymm0[6],zero,zero,zero,zero,zero,zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,ymm0[24],zero,zero,zero,zero
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[2,3,0,1]
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero,zero,zero,zero
; AVX512DQ-BW-NEXT: vpor %ymm4, %ymm6, %ymm4
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm0[5],zero,zero,zero,zero,zero,zero,ymm0[6],zero,zero,zero,zero,zero,zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,ymm0[24],zero,zero,zero,zero,zero,zero
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm0[2,3,0,1]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm1[5],zero,zero,zero,zero,zero,zero,ymm1[6],zero,zero,zero,zero,zero,zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,ymm1[24],zero,zero,zero,zero,zero,zero
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm1[2,3,0,1]
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero,zero,zero,zero,ymm7[25]
; AVX512DQ-BW-NEXT: vpor %ymm7, %ymm6, %ymm6
; AVX512DQ-BW-NEXT: movl $202911840, %ecx # imm = 0xC183060
; AVX512DQ-BW-NEXT: kmovd %ecx, %k1
; AVX512DQ-BW-NEXT: vmovdqu8 %ymm4, %ymm6 {%k1}
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,ymm0[1,9],zero,zero,zero,zero,zero,ymm0[2,10],zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero,zero,ymm0[20,28],zero,zero
; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,0,2]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,ymm1[0,8],zero,zero,zero,zero,zero,ymm1[1,9],zero,zero,zero,zero,zero,ymm1[18,26],zero,zero,zero,zero,zero,ymm1[19,27],zero,zero,zero,zero,zero,ymm1[20,28]
-; AVX512DQ-BW-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,8],zero,zero,zero,zero,zero,ymm1[1,9],zero,zero,zero,zero,zero,ymm1[2,10],zero,zero,zero,zero,zero,ymm1[19,27],zero,zero,zero,zero,zero,ymm1[20,28],zero,zero
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0,8],zero,zero,zero,zero,zero,ymm0[1,9],zero,zero,zero,zero,zero,ymm0[18,26],zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero,zero,ymm0[20,28]
+; AVX512DQ-BW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0
; AVX512DQ-BW-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870
; AVX512DQ-BW-NEXT: kmovq %rcx, %k1
-; AVX512DQ-BW-NEXT: vmovdqu8 %zmm5, %zmm0 {%k1}
-; AVX512DQ-BW-NEXT: vmovdqa %xmm2, 96(%rax)
+; AVX512DQ-BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512DQ-BW-NEXT: vmovdqa %xmm5, 96(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rax)
; AVX512DQ-BW-NEXT: vmovdqa %ymm3, 64(%rax)
; AVX512DQ-BW-NEXT: vzeroupper
@@ -2586,72 +2566,70 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm2
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %xmm3
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm4
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm4, %ymm4
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, (%r10), %zmm4, %zmm4
-; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[12,13],zero,zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[12,13],zero,zero,zero,zero,zero,xmm1[14,15],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm1
-; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,zero,zero,zero,zero,xmm1[13,12],zero,zero,zero,zero,zero,xmm1[15,14],zero
-; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm4, %ymm2
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm2[13],zero,zero,zero,zero,zero,zero,xmm2[14],zero,zero,zero,zero,zero,zero,xmm2[15]
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, (%r10), %zmm2, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm1[1,3,2,3]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[6,14],zero,zero,zero,zero,zero,xmm3[7,15],zero,zero,zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[1,3,2,3]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,xmm4[6,14],zero,zero,zero,zero,zero,xmm4[7,15],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[10],zero,zero,zero,zero,zero,xmm4[13,12],zero,zero,zero,zero,zero,xmm4[15,14],zero
+; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm2, %ymm5
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm5[13],zero,zero,zero,zero,zero,zero,xmm5[14],zero,zero,zero,zero,zero,zero,xmm5[15]
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm4, %xmm6, %xmm4
; AVX512DQ-BW-FCP-NEXT: movw $-7741, %cx # imm = 0xE1C3
; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6]
-; AVX512DQ-BW-FCP-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT: vpermw %ymm2, %ymm1, %ymm1
-; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm4[1,3,1,3]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm4, %xmm3 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6]
+; AVX512DQ-BW-FCP-NEXT: # ymm4 = mem[0,1,0,1]
+; AVX512DQ-BW-FCP-NEXT: vpermw %ymm5, %ymm4, %ymm4
+; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm2[1,3,1,3]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21]
; AVX512DQ-BW-FCP-NEXT: movl $67637280, %ecx # imm = 0x4081020
; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm3 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm6[1,3,3,1]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,ymm1[1,9],zero,zero,zero,zero,zero,ymm1[2,10],zero,zero,zero,zero,zero,ymm1[3,19],zero,zero,zero,zero,zero,ymm1[28,20],zero,zero,zero,zero,zero,ymm1[29,21],zero
-; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm5[3,1,1,3]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm4, %ymm6 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[1,3,3,1]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,ymm4[1,9],zero,zero,zero,zero,zero,ymm4[2,10],zero,zero,zero,zero,zero,ymm4[3,19],zero,zero,zero,zero,zero,ymm4[28,20],zero,zero,zero,zero,zero,ymm4[29,21],zero
+; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm1[3,1,1,3]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[1],zero,zero,zero,zero,zero,ymm7[10,2],zero,zero,zero,zero,zero,ymm7[11,3],zero,zero,zero,zero,zero,ymm7[20,28],zero,zero,zero,zero,zero,ymm7[21,29],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT: vpor %ymm1, %ymm7, %ymm1
+; AVX512DQ-BW-FCP-NEXT: vpor %ymm4, %ymm7, %ymm4
; AVX512DQ-BW-FCP-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38
; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm3, %ymm1 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,50,50,50,52,50,50,50,52,51,51,51,51,50,50,50,52]
-; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm2, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,5,2,6,1,5,2,6]
-; AVX512DQ-BW-FCP-NEXT: # ymm2 = mem[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm4, %ymm2, %ymm7
-; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,0,2]
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u,32,36,u,u,u,u,u,33,37,u,u,u,u,u,34,38,u,u,u,u,u,51,55,u,u,u,u,u,56,60,u,u]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm6, %ymm4 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm4, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,50,50,50,52,50,50,50,52,51,51,51,51,50,50,50,52]
+; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm5, %zmm6
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,5,2,6,1,5,2,6]
+; AVX512DQ-BW-FCP-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm2, %ymm5, %ymm7
+; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2]
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u,32,36,u,u,u,u,u,33,37,u,u,u,u,u,34,38,u,u,u,u,u,51,55,u,u,u,u,u,56,60,u,u]
; AVX512DQ-BW-FCP-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040
; AVX512DQ-BW-FCP-NEXT: kmovq %rcx, %k1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm3, %zmm4 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm5, %ymm2, %ymm3
-; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,0,2]
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[0,8],zero,zero,zero,zero,zero,zmm3[1,9],zero,zero,zero,zero,zero,zmm3[2,10],zero,zero,zero,zero,zero,zmm3[19,27],zero,zero,zero,zero,zero,zmm3[20,28],zero,zero,zero,zero,zero,zmm3[33,37],zero,zero,zero,zero,zero,zmm3[34,38],zero,zero,zero,zero,zero,zmm3[51,55],zero,zero,zero,zero,zero,zmm3[56,60],zero,zero,zero,zero,zero,zmm3[57]
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm6, %ymm2, %ymm2
-; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm6[0,2,0,2]
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm2 = zero,zero,zmm2[0,8],zero,zero,zero,zero,zero,zmm2[1,9],zero,zero,zero,zero,zero,zmm2[18,26],zero,zero,zero,zero,zero,zmm2[19,27],zero,zero,zero,zero,zero,zmm2[20,28],zero,zero,zero,zero,zero,zmm2[33,37],zero,zero,zero,zero,zero,zmm2[34,38],zero,zero,zero,zero,zero,zmm2[51,55],zero,zero,zero,zero,zero,zmm2[56,60],zero,zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm2 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm5, %ymm6
+; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,0,2]
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm1, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zmm1[2,10],zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zero,zero,zmm1[33,37],zero,zero,zero,zero,zero,zmm1[34,38],zero,zero,zero,zero,zero,zmm1[51,55],zero,zero,zero,zero,zero,zmm1[56,60],zero,zero,zero,zero,zero,zmm1[57]
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm0, %ymm5, %ymm5
+; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2]
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zmm0[0,8],zero,zero,zero,zero,zero,zmm0[1,9],zero,zero,zero,zero,zero,zmm0[18,26],zero,zero,zero,zero,zero,zmm0[19,27],zero,zero,zero,zero,zero,zmm0[20,28],zero,zero,zero,zero,zero,zmm0[33,37],zero,zero,zero,zero,zero,zmm0[34,38],zero,zero,zero,zero,zero,zmm0[51,55],zero,zero,zero,zero,zero,zmm0[56,60],zero,zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512DQ-BW-FCP-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870
; AVX512DQ-BW-FCP-NEXT: kmovq %rcx, %k1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm2 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, 96(%rax)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm1, 64(%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, 96(%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm4, 64(%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%in.vec0 = load <16 x i8>, ptr %in.vecptr0, align 64
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
index 9e82c84fe5520f..ec54b755135829 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
@@ -946,10 +946,8 @@ define <2 x i64> @PR116815(<4 x i64> %v0, <4 x i64> %v1) {
; CHECK: # %bb.0:
; CHECK-NEXT: vpslld $16, %ymm1, %ymm1
; CHECK-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,2,6,10,14,u,u,u,u,u,u,u,u,16,20,24,28,18,22,26,30,u,u,u,u,u,u,u,u]
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
-; CHECK-NEXT: vmovq {{.*#+}} xmm2 = [0,4,8,12,2,6,10,14,0,0,0,0,0,0,0,0]
-; CHECK-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; CHECK-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
index 3d49edbb7bd8d2..3e76bffb77a665 100644
--- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
@@ -483,50 +483,42 @@ define <16 x i1> @interleaved_load_vf16_i8_stride4(ptr %ptr) nounwind {
;
; AVX2-LABEL: interleaved_load_vf16_i8_stride4:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX2-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX2-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm5
-; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm4
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX2-NEXT: vmovd {{.*#+}} xmm5 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm6
-; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm5
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX2-NEXT: vmovdqa (%rdi), %ymm2
+; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0
+; AVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm3
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm1
+; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,0,0,4]
+; AVX2-NEXT: vpermd %ymm1, %ymm4, %ymm1
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm5
+; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm6
-; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm5
+; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm3
+; AVX2-NEXT: vpermd %ymm3, %ymm4, %ymm3
+; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
+; AVX2-NEXT: vpcmpeqb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm5
+; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; AVX2-NEXT: vmovd {{.*#+}} xmm6 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm7
-; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm6
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
-; AVX2-NEXT: vpcmpeqb %xmm5, %xmm4, %xmm4
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm6
-; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm5
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; AVX2-NEXT: vmovd {{.*#+}} xmm6 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm7
-; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm6
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX2-NEXT: vpshufb %xmm6, %xmm3, %xmm3
-; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm2
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX2-NEXT: vmovd {{.*#+}} xmm3 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
-; AVX2-NEXT: vpcmpeqb %xmm0, %xmm5, %xmm0
-; AVX2-NEXT: vpxor %xmm0, %xmm4, %xmm0
+; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm3
+; AVX2-NEXT: vpermd %ymm3, %ymm4, %ymm3
+; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX2-NEXT: vpshufb %ymm5, %ymm2, %ymm2
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm6
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1]
+; AVX2-NEXT: vpshufb %ymm5, %ymm0, %ymm0
+; AVX2-NEXT: vpermd %ymm0, %ymm4, %ymm0
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
+; AVX2-NEXT: vpcmpeqb %xmm0, %xmm3, %xmm0
+; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: interleaved_load_vf16_i8_stride4:
@@ -646,76 +638,66 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(ptr %ptr) nounwind {
;
; AVX2-LABEL: interleaved_load_vf32_i8_stride4:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa 64(%rdi), %ymm0
-; AVX2-NEXT: vmovdqa 96(%rdi), %ymm1
-; AVX2-NEXT: vmovdqa (%rdi), %xmm2
-; AVX2-NEXT: vmovdqa 16(%rdi), %xmm3
-; AVX2-NEXT: vmovdqa 32(%rdi), %xmm4
-; AVX2-NEXT: vmovdqa 48(%rdi), %xmm5
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX2-NEXT: vpshufb %xmm6, %xmm5, %xmm7
-; AVX2-NEXT: vpshufb %xmm6, %xmm4, %xmm6
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX2-NEXT: vpshufb %xmm7, %xmm3, %xmm8
-; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm9
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm6[2,3]
-; AVX2-NEXT: vpshufb %ymm7, %ymm1, %ymm9
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,4,0,4,0,4,0,4]
-; AVX2-NEXT: vpermd %ymm9, %ymm6, %ymm9
-; AVX2-NEXT: vpshufb %ymm7, %ymm0, %ymm7
-; AVX2-NEXT: vpermd %ymm7, %ymm6, %ymm7
-; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX2-NEXT: vpshufb %xmm8, %xmm5, %xmm9
-; AVX2-NEXT: vpshufb %xmm8, %xmm4, %xmm8
+; AVX2-NEXT: vmovdqa (%rdi), %ymm1
+; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0
+; AVX2-NEXT: vmovdqa 64(%rdi), %ymm3
+; AVX2-NEXT: vmovdqa 96(%rdi), %ymm4
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX2-NEXT: vpshufb %ymm5, %ymm4, %ymm6
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,4,0,4,0,4,0,4]
+; AVX2-NEXT: vpermd %ymm6, %ymm2, %ymm6
+; AVX2-NEXT: vpshufb %ymm5, %ymm3, %ymm7
+; AVX2-NEXT: vpermd %ymm7, %ymm2, %ymm7
+; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
+; AVX2-NEXT: vpshufb %ymm5, %ymm1, %ymm7
+; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm8
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
+; AVX2-NEXT: vpshufb %ymm5, %ymm0, %ymm5
+; AVX2-NEXT: vpermd %ymm5, %ymm2, %ymm5
+; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX2-NEXT: vpshufb %ymm6, %ymm4, %ymm7
+; AVX2-NEXT: vpermd %ymm7, %ymm2, %ymm7
+; AVX2-NEXT: vpshufb %ymm6, %ymm3, %ymm8
+; AVX2-NEXT: vpermd %ymm8, %ymm2, %ymm8
+; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
+; AVX2-NEXT: vpshufb %ymm6, %ymm1, %ymm8
+; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm9
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1]
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX2-NEXT: vpshufb %xmm9, %xmm3, %xmm10
-; AVX2-NEXT: vpshufb %xmm9, %xmm2, %xmm11
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3]
-; AVX2-NEXT: vpshufb %ymm9, %ymm1, %ymm10
-; AVX2-NEXT: vpermd %ymm10, %ymm6, %ymm10
-; AVX2-NEXT: vpshufb %ymm9, %ymm0, %ymm9
-; AVX2-NEXT: vpermd %ymm9, %ymm6, %ymm9
-; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-NEXT: vpcmpeqb %ymm7, %ymm8, %ymm7
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX2-NEXT: vpshufb %xmm8, %xmm5, %xmm9
-; AVX2-NEXT: vpshufb %xmm8, %xmm4, %xmm8
+; AVX2-NEXT: vpshufb %ymm6, %ymm0, %ymm6
+; AVX2-NEXT: vpermd %ymm6, %ymm2, %ymm6
+; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-NEXT: vpcmpeqb %ymm6, %ymm5, %ymm5
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX2-NEXT: vpshufb %ymm6, %ymm4, %ymm7
+; AVX2-NEXT: vpermd %ymm7, %ymm2, %ymm7
+; AVX2-NEXT: vpshufb %ymm6, %ymm3, %ymm8
+; AVX2-NEXT: vpermd %ymm8, %ymm2, %ymm8
+; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
+; AVX2-NEXT: vpshufb %ymm6, %ymm1, %ymm8
+; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm9
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1]
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX2-NEXT: vpshufb %xmm9, %xmm3, %xmm10
-; AVX2-NEXT: vpshufb %xmm9, %xmm2, %xmm11
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3]
-; AVX2-NEXT: vpshufb %ymm9, %ymm1, %ymm10
-; AVX2-NEXT: vpermd %ymm10, %ymm6, %ymm10
-; AVX2-NEXT: vpshufb %ymm9, %ymm0, %ymm9
-; AVX2-NEXT: vpermd %ymm9, %ymm6, %ymm9
-; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm9 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX2-NEXT: vpshufb %xmm9, %xmm5, %xmm5
-; AVX2-NEXT: vpshufb %xmm9, %xmm4, %xmm4
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3
-; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
-; AVX2-NEXT: vpshufb %ymm5, %ymm1, %ymm1
-; AVX2-NEXT: vpermd %ymm1, %ymm6, %ymm1
-; AVX2-NEXT: vpshufb %ymm5, %ymm0, %ymm0
-; AVX2-NEXT: vpermd %ymm0, %ymm6, %ymm0
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT: vpcmpeqb %ymm0, %ymm8, %ymm0
-; AVX2-NEXT: vpxor %ymm0, %ymm7, %ymm0
+; AVX2-NEXT: vpshufb %ymm6, %ymm0, %ymm6
+; AVX2-NEXT: vpermd %ymm6, %ymm2, %ymm6
+; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX2-NEXT: vpshufb %ymm7, %ymm4, %ymm4
+; AVX2-NEXT: vpermd %ymm4, %ymm2, %ymm4
+; AVX2-NEXT: vpshufb %ymm7, %ymm3, %ymm3
+; AVX2-NEXT: vpermd %ymm3, %ymm2, %ymm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
+; AVX2-NEXT: vpshufb %ymm7, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; AVX2-NEXT: vpshufb %ymm7, %ymm0, %ymm0
+; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: vpcmpeqb %ymm0, %ymm6, %ymm0
+; AVX2-NEXT: vpxor %ymm0, %ymm5, %ymm0
; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;